diff --git a/debian/bin/genpatch-pfkernel b/debian/bin/genpatch-pfkernel new file mode 100755 index 0000000..1356fdb --- /dev/null +++ b/debian/bin/genpatch-pfkernel @@ -0,0 +1,61 @@ +#!/bin/sh +set -ef + +export GIT_OPTIONAL_LOCKS=0 + +w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w" + +dst='debian/patches/pf' +src='../linux-extras' +branches='amd-pstate amd-rapl cpuidle crypto fixes ksm zstd' + +[ -d "${dst}" ] + +kver= +if [ -n "$1" ] ; then + kver="$1" +else + kver=$(dpkg-parsechangelog --show-field=Version | sed -E 's/^[0-9]+://;s/-[^-]*$//' | cut -d. -f1-2) +fi +from="upstream/linux-${kver}.y" + +t=$(mktemp -d) ; : "${t:?}" + +cp -ar "${src}" "$t/" +cd "$t/${src##*/}" + +git config advice.skippedCherryPicks false + +for b in ${branches} ; do + ref="pf/$b-${kver}" + r="tmp-rebase-$b" + + git switch --detach "${ref}" + git switch -C "$r" + + if git rebase "${from}" ; then + [ -d "$w/${dst}/$b/" ] || mkdir -p "$w/${dst}/$b" + + set +e + env -C "$w" git ls-files -z | grep -zF "${dst}/$b/" | grep -zFv '/.' | env -C "$w" -u GIT_OPTIONAL_LOCKS xargs -r -0 git rm -f + find "$w/${dst}/$b/" -name '*.patch' -type f -exec rm -f {} + + set -e + + git format-patch -N --subject-prefix='' --output-directory "$w/${dst}/$b" "${from}..$r" + else + echo >&2 + git rebase --abort + echo >&2 + fi + + git switch -q --detach "${ref}" + git branch -D "$r" + echo >&2 +done + +cd "$w" ; rm -rf "$t" + +echo >&2 +echo 'put in debian/patches/series' >&2 +echo >&2 +find "${dst}/" -type f -name '*.patch' | sed -E 's#^debian/patches/##' | sort -V diff --git a/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch new file mode 100644 index 0000000..30a817b --- /dev/null +++ b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch @@ -0,0 +1,52 @@ +this reverts following commit: + + From: Josh Poimboeuf + Date: Thu, 14 Jan 2021 16:32:42 -0600 + Subject: objtool: Don't fail the kernel build on fatal errors + + [ Upstream commit 655cf86548a3938538642a6df27dd359e13c86bd ] + + This is basically a revert of commit 644592d32837 ("objtool: Fail the + kernel build on fatal errors"). + + That change turned out to be more trouble than it's worth. Failing the + build is an extreme measure which sometimes gets too much attention and + blocks CI build testing. + + These fatal-type warnings aren't yet as rare as we'd hope, due to the + ever-increasing matrix of supported toolchains/plugins and their + fast-changing nature as of late. + + Also, there are more people (and bots) looking for objtool warnings than + ever before, so even non-fatal warnings aren't likely to be ignored for + long. + + Suggested-by: Nick Desaulniers + Reviewed-by: Miroslav Benes + Reviewed-by: Nick Desaulniers + Reviewed-by: Kamalesh Babulal + Signed-off-by: Josh Poimboeuf + Signed-off-by: Sasha Levin + +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -4872,10 +4872,14 @@ int check(struct objtool_file *file) + } + + out: +- /* +- * For now, don't fail the kernel build on fatal warnings. These +- * errors are still fairly common due to the growing matrix of +- * supported toolchains and their recent pace of change. +- */ ++ if (ret < 0) { ++ /* ++ * Fatal error. The binary is corrupt or otherwise broken in ++ * some way, or objtool itself is broken. Fail the kernel ++ * build. 
++ */ ++ return ret; ++ } ++ + return 0; + } diff --git a/debian/patches/krd/0002-established-timeout.patch b/debian/patches/krd/0002-established-timeout.patch new file mode 100644 index 0000000..3330b53 --- /dev/null +++ b/debian/patches/krd/0002-established-timeout.patch @@ -0,0 +1,11 @@ +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -61,7 +61,7 @@ enum nf_ct_tcp_action { + static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, +- [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, ++ [TCP_CONNTRACK_ESTABLISHED] = 128 MINS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, diff --git a/debian/patches/krd/0003-local-ports.patch b/debian/patches/krd/0003-local-ports.patch new file mode 100644 index 0000000..fd20a12 --- /dev/null +++ b/debian/patches/krd/0003-local-ports.patch @@ -0,0 +1,11 @@ +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -1802,7 +1802,7 @@ static __net_init int inet_init_net(stru + /* + * Set defaults for local port range + */ +- net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; ++ net->ipv4.ip_local_ports.range = 65533u << 16 | 49152u; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* diff --git a/debian/patches/krd/0004-bridge-group_fwd_mask.patch b/debian/patches/krd/0004-bridge-group_fwd_mask.patch new file mode 100644 index 0000000..61ac436 --- /dev/null +++ b/debian/patches/krd/0004-bridge-group_fwd_mask.patch @@ -0,0 +1,37 @@ +--- a/net/bridge/br_input.c ++++ b/net/bridge/br_input.c +@@ -374,7 +374,11 @@ static rx_handler_result_t br_handle_fra + return RX_HANDLER_PASS; + + case 0x01: /* IEEE MAC (Pause) */ +- goto drop; ++ fwd_mask |= p->br->group_fwd_mask; ++ if (fwd_mask & (1u << dest[5])) ++ goto forward; ++ else ++ goto drop; + + case 0x0E: /* 802.1AB LLDP */ + fwd_mask |= p->br->group_fwd_mask; +--- a/net/bridge/br_netlink.c ++++ b/net/bridge/br_netlink.c +@@ -1365,8 +1365,6 @@ static int br_changelink(struct net_devi + if (data[IFLA_BR_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); + +- if (fwd_mask & BR_GROUPFWD_RESTRICTED) +- return -EINVAL; + br->group_fwd_mask = fwd_mask; + } + +--- a/net/bridge/br_sysfs_br.c ++++ b/net/bridge/br_sysfs_br.c +@@ -179,8 +179,6 @@ static ssize_t group_fwd_mask_show(struc + static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) + { +- if (val & BR_GROUPFWD_RESTRICTED) +- return -EINVAL; + + br->group_fwd_mask = val; + diff --git a/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch b/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch new file mode 100644 index 0000000..620da28 --- /dev/null +++ b/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch @@ -0,0 +1,52 @@ +From ce1cd7869a208112a8728d1fe9e373f78a2e4a6e Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 11 Jun 2019 12:26:55 -0400 +Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection + +This commit is a bug fix for the Linux TCP app-limited +(application-limited) logic that is used for collecting rate +(bandwidth) samples. + +Previously the app-limited logic only looked for "bubbles" of +silence in between application writes, by checking at the start +of each sendmsg. But "bubbles" of silence can also happen before +retransmits: e.g. 
bubbles can happen between an application write +and a retransmit, or between two retransmits. + +Retransmits are triggered by ACKs or timers. So this commit checks +for bubbles of app-limited silence upon ACKs or timers. + +Why does this commit check for app-limited state at the start of +ACKs and timer handling? Because at that point we know whether +inflight was fully using the cwnd. During processing the ACK or +timer event we often change the cwnd; after changing the cwnd we +can't know whether inflight was fully using the old cwnd. + +Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc +Change-Id: I37221506f5166877c2b110753d39bb0757985e68 +Signed-off-by: Alexandre Frade +--- + net/ipv4/tcp_input.c | 1 + + net/ipv4/tcp_timer.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3961,6 +3961,7 @@ static int tcp_ack(struct sock *sk, cons + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + diff --git a/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch b/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch new file mode 100644 index 0000000..b279c96 --- /dev/null +++ b/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch @@ -0,0 +1,74 @@ +From b32715fbe2ab96d1060ec37bb9c03feedf366494 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 24 Jun 2018 21:55:59 -0400 +Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp, + first_tx_mstamp to u32 to free up 8 bytes + +Free up some space for tracking inflight and losses for each +bw sample, in upcoming commits. + +These timestamps are in microseconds, and are now stored in 32 +bits. So they can only hold time intervals up to roughly 2^12 = 4096 +seconds. But Linux TCP RTT and RTO tracking has the same 32-bit +microsecond implementation approach and resulting deployment +limitations. So this is not introducing a new limit. And these should +not be a limitation for the foreseeable future. 
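To make the timestamp arithmetic concrete, here is a minimal user-space sketch (illustrative names, not the kernel code itself) of a 32-bit microsecond delta that remains correct across the ~4295-second wrap, assuming the measured interval is much shorter than the wrap period:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe delta of two 32-bit microsecond stamps, clamped at zero;
 * this mirrors the max_t(s32, t1 - t0, 0) idiom the patch introduces. */
static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
	int32_t delta = (int32_t)(t1 - t0);

	return delta > 0 ? (uint32_t)delta : 0;
}

int main(void)
{
	uint32_t t0 = 0xFFFFFFF0u;	/* 16 us before the 32-bit wrap */
	uint32_t t1 = 0x00000010u;	/* 16 us after the wrap */

	printf("%u\n", stamp32_us_delta(t1, t0));	/* prints 32 */
	return 0;
}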
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55 +Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 9 +++++++-- + net/ipv4/tcp_rate.c | 7 ++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -884,6 +884,11 @@ static inline u32 tcp_stamp_us_delta(u64 + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +978,9 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + diff --git a/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch b/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch new file mode 100644 index 0000000..07bebb5 --- /dev/null +++ b/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch @@ -0,0 +1,109 @@ +From 25856231832186fe13189b986cc0e91860c18201 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sat, 5 Aug 2017 11:49:50 -0400 +Subject: [PATCH 03/19] net-tcp_bbr: v2: snapshot packets in flight at transmit + time and pass in rate_sample + +CC algorithms may want to snapshot the number of packets in flight at +transmit time and pass in rate_sample, to understand the relationship +between inflight and losses or ECN signals, to try to find the highest +inflight value that has acceptable levels of loss/ECN marking. + +We split out the code to set an skb's tx.in_flight field into its own +function, so that this code can be used for the TCP_REPAIR "fake send" +code path that inserts skbs into the rtx queue without sending them. 
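As a rough standalone illustration of why the transmit-time snapshot is clamped (hypothetical names; the real field lives in TCP_SKB_CB(skb)->tx), the value is stored in a 20-bit field, so anything above (1 << 20) - 1 segments is saturated rather than allowed to overflow:

#include <stdio.h>

#define IN_FLIGHT_BITS	20
#define IN_FLIGHT_MAX	((1U << IN_FLIGHT_BITS) - 1)

struct tx_snapshot {
	unsigned int in_flight : IN_FLIGHT_BITS;	/* packets in flight at transmit */
	unsigned int unused    : 12;
};

static void record_in_flight(struct tx_snapshot *tx, unsigned int in_flight)
{
	if (in_flight > IN_FLIGHT_MAX)	/* saturate instead of wrapping the bitfield */
		in_flight = IN_FLIGHT_MAX;
	tx->in_flight = in_flight;
}

int main(void)
{
	struct tx_snapshot tx = { 0 };

	record_in_flight(&tx, 2000000);			/* above the 20-bit maximum */
	printf("%u\n", (unsigned int)tx.in_flight);	/* prints 1048575 */
	return 0;
}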
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea +Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b +Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63 +Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 6 ++++++ + net/ipv4/tcp_output.c | 1 + + net/ipv4/tcp_rate.c | 20 ++++++++++++++++++++ + 3 files changed, 27 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -981,6 +981,10 @@ struct tcp_skb_cb { + u32 first_tx_mstamp; + /* when we reached the "delivered" count */ + u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1136,6 +1140,7 @@ struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ +@@ -1258,6 +1263,7 @@ static inline void tcp_ca_event(struct s + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2765,6 +2765,7 @@ static bool tcp_write_xmit(struct sock * + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -67,6 +85,7 @@ void tcp_rate_skb_sent(struct sock *sk, + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -96,6 +115,7 @@ void tcp_rate_skb_delivered(struct sock + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ diff --git a/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch b/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch new file mode 100644 index 0000000..8882ff8 --- /dev/null +++ b/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch @@ -0,0 +1,70 @@ +From b1772710e8b5b98c09e96d4f1af620cd938fddf7 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Thu, 12 Oct 2017 23:44:27 -0400 +Subject: [PATCH 04/19] net-tcp_bbr: v2: count packets lost over TCP rate + sampling interval + +For understanding the relationship between inflight and packet loss +signals, to try to find the highest inflight value that has acceptable +levels of packet losses. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab +Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 5 ++++- + net/ipv4/tcp_rate.c | 3 +++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -985,6 +985,7 @@ struct tcp_skb_cb { + #define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1138,11 +1139,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -84,6 +84,7 @@ void tcp_rate_skb_sent(struct sock *sk, + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; + tcp_set_tx_in_flight(sk, skb); + } +@@ -110,6 +111,7 @@ void tcp_rate_skb_delivered(struct sock + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; +@@ -165,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 d + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ diff --git a/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch b/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch new file mode 100644 index 0000000..1045dbf --- /dev/null +++ b/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch @@ -0,0 +1,38 @@ +From fdf01142aea8645186e080f1278d3b5a5fd8c66c Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Mon, 19 Nov 2018 13:48:36 -0500 +Subject: [PATCH 05/19] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece + +For understanding the relationship between inflight and ECN signals, +to try to find the highest inflight value that has acceptable levels +ECN marking. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8 +Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1157,6 +1157,7 @@ struct rate_sample { + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4060,6 +4060,7 @@ static int tcp_ack(struct sock *sk, cons + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); diff --git a/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch b/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch new file mode 100644 index 0000000..1582d9e --- /dev/null +++ b/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch @@ -0,0 +1,57 @@ +From a3e88432c2ebf12de9c2053a13417ddf2ad4cb4e Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 7 Aug 2018 21:52:06 -0400 +Subject: [PATCH 06/19] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC + module callback API + +For connections experiencing reordering, RACK can mark packets lost +long after we receive the SACKs/ACKs hinting that the packets were +actually lost. + +This means that CC modules cannot easily learn the volume of inflight +data at which packet loss happens by looking at the current inflight +or even the packets in flight when the most recently SACKed packet was +sent. 
To learn this, CC modules need to know how many packets were in +flight at the time lost packets were sent. This new callback, combined +with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this. + +This also provides a consistent callback that is invoked whether +packets are marked lost upon ACK processing, using the RACK reordering +timer, or at RTO time. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4 +Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 3 +++ + net/ipv4/tcp_input.c | 5 +++++ + 2 files changed, 8 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1184,6 +1184,9 @@ struct tcp_congestion_ops { + /* override sysctl_tcp_min_tso_segs */ + u32 (*min_tso_segs)(struct sock *sk); + ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); ++ + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(s + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) diff --git a/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch b/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch new file mode 100644 index 0000000..35dec47 --- /dev/null +++ b/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch @@ -0,0 +1,59 @@ +From af7d33e71649b8e2ae00dccf336720a8ab891606 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Wed, 1 May 2019 20:16:33 -0400 +Subject: [PATCH 07/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in + tcp_shifted_skb() + +When tcp_shifted_skb() updates state as adjacent SACKed skbs are +coalesced, previously the tx.in_flight was not adjusted, so we could +get contradictory state where the skb's recorded pcount was bigger +than the tx.in_flight (the number of segments that were in_flight +after sending the skb). + +Normally have a SACKed skb with contradictory pcount/tx.in_flight +would not matter. However, with SACK reneging, the SACKed bit is +removed, and an skb once again becomes eligible for retransmitting, +fragmenting, SACKing, etc. Packetdrill testing verified the following +sequence is possible in a kernel that does not have this commit: + + - skb N is SACKed + - skb N+1 is SACKed and combined with skb N using tcp_shifted_skb() + - tcp_shifted_skb() will increase the pcount of prev, + but leave tx.in_flight as-is + - so prev skb can have pcount > tx.in_flight + - RTO, tcp_timeout_mark_lost(), detect reneg, + remove "SACKed" bit, mark skb N as lost + - find pcount of skb N is greater than its tx.in_flight + +I suspect this issue iw what caused the bbr2_inflight_hi_from_lost_skb(): + WARN_ON_ONCE(inflight_prev < 0) +to fire in production machines using bbr2. 
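A minimal sketch of the bookkeeping this adds (standalone types invented for illustration, not the kernel structures): when pcount segments are shifted from skb into prev, the recorded in-flight snapshot moves with them, so neither skb ends up with a pcount larger than its tx.in_flight:

#include <stdio.h>

struct skb_state {
	unsigned int pcount;	/* segments carried by this skb */
	unsigned int in_flight;	/* packets in flight when it was sent */
};

static void shift_segments(struct skb_state *prev, struct skb_state *skb,
			   unsigned int pcount)
{
	skb->pcount -= pcount;
	prev->pcount += pcount;

	/* Keep the in-flight snapshots consistent with the new pcounts. */
	skb->in_flight = skb->in_flight >= pcount ? skb->in_flight - pcount : 0;
	prev->in_flight += pcount;
}

int main(void)
{
	struct skb_state prev = { .pcount = 2, .in_flight = 10 };
	struct skb_state skb  = { .pcount = 3, .in_flight = 13 };

	shift_segments(&prev, &skb, 3);		/* merge all of skb into prev */
	printf("prev %u/%u, skb %u/%u\n",
	       prev.pcount, prev.in_flight, skb.pcount, skb.in_flight);
	return 0;
}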
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715 +Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55 +Signed-off-by: Alexandre Frade +--- + net/ipv4/tcp_input.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1506,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep diff --git a/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch b/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch new file mode 100644 index 0000000..f437449 --- /dev/null +++ b/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch @@ -0,0 +1,97 @@ +From a4d44bce49f61f8755f558dc40edff5f8958b7c6 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Wed, 1 May 2019 20:16:25 -0400 +Subject: [PATCH 08/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in + tcp_fragment() + +When we fragment an skb that has already been sent, we need to update +the tx.in_flight for the first skb in the resulting pair ("buff"). + +Because we were not updating the tx.in_flight, the tx.in_flight value +was inconsistent with the pcount of the "buff" skb (tx.in_flight would +be too high). That meant that if the "buff" skb was lost, then +bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value +that is too high. This could result in longer queues and higher packet +loss. + +Packetdrill testing verified that without this commit, when the second +half of an skb is SACKed and then later the first half of that skb is +marked lost, the calculated inflight_hi was incorrect. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874 +Origin-9xx-SHA1: a0eb099690af net-tcp_bbr: v2: fix tcp_fragment() tx.in_flight recomputation [prod feb 8 2021; use as a fixup] +Origin-9xx-SHA1: 885503228153ff0c9114e net-tcp_bbr: v2: introduce tcp_skb_tx_in_flight_is_suspicious() helper for warnings +Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 15 +++++++++++++++ + net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++- + 2 files changed, 40 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1283,6 +1283,21 @@ static inline bool tcp_skb_sent_after(u6 + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1601,7 +1601,7 @@ int tcp_fragment(struct sock *sk, enum t + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1676,6 +1676,30 @@ int tcp_fragment(struct sock *sk, enum t + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ diff --git a/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch b/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch new file mode 100644 index 0000000..4f9b314 --- /dev/null +++ b/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch @@ -0,0 +1,73 @@ +From 65cca0e8fd954a150ec874650af47f7800ea3049 Mon Sep 17 00:00:00 2001 +From: Yousuk Seung +Date: Wed, 23 May 2018 17:55:54 -0700 +Subject: [PATCH 09/19] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS + +Add a a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a +congestion control module to receive CE events. + +Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN +bit in opts flag to receive CE events but this may incur changes in ECN +behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS +that allows congestion control modules to receive CE events +independently of TCP_CONG_NEEDS_ECN. 
+ +Effort: net-tcp +Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b +Change-Id: I2255506985242f376d910c6fd37daabaf4744f24 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 14 +++++++++++++- + net/ipv4/tcp_input.c | 4 ++-- + 2 files changed, 15 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1119,7 +1119,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1251,6 +1255,14 @@ static inline char *tcp_ca_get_name_by_k + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct so + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct so + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; diff --git a/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch b/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch new file mode 100644 index 0000000..b195ec9 --- /dev/null +++ b/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch @@ -0,0 +1,118 @@ +From 3acb852e1cfcdeea388bd428c6dd81609fd40792 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Fri, 27 Sep 2019 17:10:26 -0400 +Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API + +Reorganize the API for CC modules so that the CC module once again +gets complete control of the TSO sizing decision. This is how the API +was set up around 2016 and the initial BBRv1 upstreaming. Later Eric +Dumazet simplified it. But with wider testing it now seems that to +avoid CPU regressions BBR needs to have a different TSO sizing +function. + +This is necessary to handle cases where there are many flows +bottlenecked on the sender host's NIC, in which case BBR's pacing rate +is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because +BBR's pacing rate adapts to the low bandwidth share each flow sees. By +contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very +large cwnd, and thus large pacing rate and large TSO burst size. 
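Roughly, the burst budget described above is derived from the pacing rate; here is a user-space sketch of that calculation (invented function, assuming the common pacing shift of 10, i.e. about 1 ms worth of data per burst):

#include <stdint.h>
#include <stdio.h>

static uint32_t tso_segs_budget(uint64_t pacing_rate_Bps, uint32_t pacing_shift,
				uint32_t mss, uint32_t min_segs)
{
	uint64_t bytes = pacing_rate_Bps >> pacing_shift;	/* byte budget per burst */
	uint32_t segs = (uint32_t)(bytes / mss);

	return segs > min_segs ? segs : min_segs;
}

int main(void)
{
	/* A 100 Mbit/s flow gets sizable bursts; a 1 Mbit/s flow gets the minimum. */
	printf("%u\n", tso_segs_budget(12500000, 10, 1448, 2));	/* prints 8 */
	printf("%u\n", tso_segs_budget(125000, 10, 1448, 2));	/* prints 2 */
	return 0;
}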
+ +Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 4 ++-- + net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++----------- + net/ipv4/tcp_output.c | 11 +++++------ + 3 files changed, 33 insertions(+), 19 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1185,8 +1185,8 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs( + return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; + } + ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ u32 segs; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -1150,7 +1165,7 @@ static struct tcp_congestion_ops tcp_bbr + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2057,13 +2057,12 @@ static u32 tcp_tso_autosize(const struct + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; ++ u32 tso_segs; + +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); +- +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + diff --git a/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch b/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch new file mode 100644 index 0000000..894d5b1 --- /dev/null +++ b/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch @@ -0,0 +1,72 @@ +From 3741ada76bab5111cbb9c279cf27e67a0334eb05 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 7 Jan 2024 21:11:26 -0300 +Subject: [PATCH 11/19] net-tcp: add fast_ack_mode=1: skip rwin check in + tcp_fast_ack_mode__tcp_ack_snd_check() + +Add logic for an experimental TCP connection behavior, enabled with +tp->fast_ack_mode = 1, which disables checking the receive window +before sending an ack in __tcp_ack_snd_check(). If this behavior is +enabled, the data receiver sends an ACK if the amount of data is > +RCV.MSS. + +Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4 +Signed-off-by: Alexandre Frade +--- + include/linux/tcp.h | 3 ++- + net/ipv4/tcp.c | 1 + + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 5 +++-- + 4 files changed, 7 insertions(+), 3 deletions(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,8 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ unused:3; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5763,13 +5763,14 @@ static void __tcp_ack_snd_check(struct s + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ diff --git a/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch b/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch new file mode 100644 index 0000000..07b0d0c --- /dev/null +++ b/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch @@ -0,0 +1,45 @@ +From e5d35b7c882b7001f8a31b14c9f08917230dedc3 Mon Sep 17 00:00:00 2001 +From: Jianfeng Wang +Date: Fri, 19 Jun 2020 17:33:45 +0000 +Subject: [PATCH 12/19] net-tcp_bbr: v2: record app-limited status of + TLP-repaired flight + +When sending a TLP retransmit, record whether the outstanding flight +of data is application limited. This is important for congestion +control modules that want to respond to losses repaired by TLP +retransmits. This is important because the following scenarios convey +very different information: + (1) a packet loss with a small number of packets in flight; + (2) a packet loss with the maximum amount of data in flight allowed + by the CC module; + +Effort: net-tcp_bbr +Change-Id: Ic8ae567caa4e4bfd5fd82c3d4be12a5d9171655e +Signed-off-by: Alexandre Frade +--- + include/linux/tcp.h | 3 ++- + net/ipv4/tcp_output.c | 1 + + 2 files changed, 3 insertions(+), 1 deletion(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -370,7 +370,8 @@ struct tcp_sock { + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ + fast_ack_mode:2, /* which fast ack mode ? */ +- unused:3; ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -3003,6 +3003,7 @@ void tcp_send_loss_probe(struct sock *sk + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + diff --git a/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch b/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch new file mode 100644 index 0000000..217eaea --- /dev/null +++ b/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch @@ -0,0 +1,45 @@ +From 77e7c22b63f8934206b1e89c173558c3967f0779 Mon Sep 17 00:00:00 2001 +From: Jianfeng Wang +Date: Tue, 16 Jun 2020 17:41:19 +0000 +Subject: [PATCH 13/19] net-tcp_bbr: v2: inform CC module of losses repaired by + TLP probe + +Before this commit, when there is a packet loss that creates a sequence +hole that is filled by a TLP loss probe, then tcp_process_tlp_ack() +only informs the congestion control (CC) module via a back-to-back entry +and exit of CWR. But some congestion control modules (e.g. BBR) do not +respond to CWR events. + +This commit adds a new CA event with which the core TCP stack notifies +the CC module when a loss is repaired by a TLP. This will allow CC +modules that do not use the CWR mechanism to have a custom handler for +such TLP recoveries. 
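To sketch how a module might consume the new event (hypothetical my_cc names; the real entry point is the cwnd_event callback in tcp_congestion_ops), a module that ignores the CWR enter/exit pair can still account for losses repaired by a TLP probe:

#include <stdio.h>

enum ca_event {
	CA_EVENT_LOSS,
	CA_EVENT_TLP_RECOVERY,
	/* other events elided */
};

struct my_cc {
	unsigned int tlp_repaired_losses;	/* losses repaired by a TLP probe */
};

static void my_cc_cwnd_event(struct my_cc *cc, enum ca_event ev)
{
	if (ev == CA_EVENT_TLP_RECOVERY)
		cc->tlp_repaired_losses++;	/* treat as a (mild) loss signal */
}

int main(void)
{
	struct my_cc cc = { 0 };

	my_cc_cwnd_event(&cc, CA_EVENT_TLP_RECOVERY);
	printf("%u\n", cc.tlp_repaired_losses);	/* prints 1 */
	return 0;
}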
+ +Effort: net-tcp_bbr +Change-Id: Ieba72332b401b329bff5a641d2b2043a3fb8f632 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1097,6 +1097,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3859,6 +3859,7 @@ static void tcp_process_tlp_ack(struct s + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); diff --git a/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch b/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch new file mode 100644 index 0000000..43a1ade --- /dev/null +++ b/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch @@ -0,0 +1,73 @@ +From cab22a8e2e87870e8334a12ffcd0ba04ea81126f Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Mon, 21 Sep 2020 14:46:26 -0400 +Subject: [PATCH 14/19] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq + into rate_sample + +Introduce is_acking_tlp_retrans_seq into rate_sample. This bool will +export to the CC module the knowledge of whether the current ACK +matched a TLP retransmit. + +Note that when this bool is true, we cannot yet tell (in general) whether +this ACK is for the original or the TLP retransmit. + +Effort: net-tcp_bbr +Change-Id: I2e6494332167e75efcbdc99bd5c119034e9c39b4 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 12 +++++++++--- + 2 files changed, 10 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1161,6 +1161,7 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ + }; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3842,7 +3842,8 @@ static void tcp_replace_ts_recent(struct + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3870,6 +3871,11 @@ static void tcp_process_tlp_ack(struct s + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4053,7 +4059,7 @@ static int tcp_ack(struct sock *sk, cons + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4097,7 +4103,7 @@ no_queue: + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: diff --git a/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch b/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch new file mode 100644 index 0000000..5271d85 --- /dev/null +++ b/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch @@ -0,0 +1,112 @@ +From 38dd25482f815d949fec91edd7694b2f15823f67 Mon Sep 17 00:00:00 2001 +From: David Morley +Date: Fri, 14 Jul 2023 11:07:56 -0400 +Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW + +Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW. + +This feature indicates that the given destination network is a +low-latency ECN environment, meaning both that ECN CE marks are +applied by the network using a low-latency marking threshold and also +that TCP endpoints provide precise per-data-segment ECN feedback in +ACKs (where the ACK ECE flag echoes the received CE status of all +newly-acknowledged data segments). This feature indication can be used +by congestion control algorithms to decide how to interpret ECN +signals over the given destination network. + +This feature is appropriate for datacenter-style ECN marking, such as +the ECN marking approach expected by DCTCP or BBR congestion control +modules. 
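A compact standalone illustration of the plumbing (the constants mirror the values the patch adds, but the function is invented for this sketch): the destination route's feature bitmask is folded into the connection's ecn_flags, which a congestion control module can later consult to decide how to interpret CE marks:

#include <stdio.h>

#define FEATURE_ECN	(1u << 0)	/* mirrors RTAX_FEATURE_ECN */
#define FEATURE_ECN_LOW	(1u << 5)	/* mirrors RTAX_FEATURE_ECN_LOW */

#define ECN_OK	1u			/* mirrors TCP_ECN_OK */
#define ECN_LOW	16u			/* mirrors TCP_ECN_LOW */

static unsigned int ecn_flags_from_route(unsigned int route_features)
{
	unsigned int ecn_flags = 0;

	if (route_features & FEATURE_ECN)
		ecn_flags |= ECN_OK;
	if (route_features & FEATURE_ECN_LOW)
		ecn_flags |= ECN_LOW;	/* low-latency, DCTCP-style CE marking */
	return ecn_flags;
}

int main(void)
{
	printf("0x%x\n", ecn_flags_from_route(FEATURE_ECN | FEATURE_ECN_LOW));	/* 0x11 */
	return 0;
}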
+ +Signed-off-by: David Morley +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Tested-by: David Morley +Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 10 ++++++++++ + include/uapi/linux/rtnetlink.h | 4 +++- + net/ipv4/tcp_minisocks.c | 2 ++ + net/ipv4/tcp_output.c | 6 ++++-- + 4 files changed, 19 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -777,6 +778,15 @@ static inline void tcp_fast_path_check(s + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + u32 tcp_delack_max(const struct sock *sk); + + /* Compute the actual rto_min value */ +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *s + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + diff --git a/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch b/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch new file mode 100644 index 0000000..f1ee5be --- /dev/null +++ b/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch @@ -0,0 +1,2821 @@ +From 6e06d157aa50e3288c749919a81f04ec792d2d91 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 11 Jun 2019 12:54:22 -0400 +Subject: [PATCH 16/19] net-tcp_bbr: v3: update TCP "bbr" congestion control + module to BBRv3 + +BBR v3 is an enhacement to the BBR v1 algorithm. It's designed to aim for lower +queues, lower loss, and better Reno/CUBIC coexistence than BBR v1. 
+ +BBR v3 maintains the core of BBR v1: an explicit model of the network +path that is two-dimensional, adapting to estimate the (a) maximum +available bandwidth and (b) maximum safe volume of data a flow can +keep in-flight in the network. It maintains the estimated BDP as a +core guide for estimating an appropriate level of in-flight data. + +BBR v3 makes several key enhancements: + +o Its bandwidth-probing time scale is adapted, within bounds, to allow improved +coexistence with Reno and CUBIC. The bandwidth-probing time scale is (a) +extended dynamically based on estimated BDP to improve coexistence with +Reno/CUBIC; (b) bounded by an interactive wall-clock time-scale to be more +scalable and responsive than Reno and CUBIC. + +o Rather than being largely agnostic to loss and ECN marks, it explicitly uses +loss and (DCTCP-style) ECN signals to maintain its model. + +o It aims for lower losses than v1 by adjusting its model to attempt to stay +within loss rate and ECN mark rate bounds (loss_thresh and ecn_thresh, +respectively). + +o It adapts to loss/ECN signals even when the application is running out of +data ("application-limited"), in case the "application-limited" flow is also +"network-limited" (the bw and/or inflight available to this flow is lower than +previously estimated when the flow ran out of data). + +o It has a three-part model: the model explicit three tracks operating points, +where an operating point is a tuple: (bandwidth, inflight). The three operating +points are: + + o latest: the latest measurement from the current round trip + o upper bound: robust, optimistic, long-term upper bound + o lower bound: robust, conservative, short-term lower bound + +These are stored in the following state variables: + + o latest: bw_latest, inflight_latest + o lo: bw_lo, inflight_lo + o hi: bw_hi[2], inflight_hi + +To gain intuition about the meaning of the three operating points, it +may help to consider the analogs in CUBIC, which has a somewhat +analogous three-part model used by its probing state machine: + + BBR param CUBIC param + ----------- ------------- + latest ~ cwnd + lo ~ ssthresh + hi ~ last_max_cwnd + +The analogy is only a loose one, though, since the BBR operating +points are calculated differently, and are 2-dimensional (bw,inflight) +rather than CUBIC's one-dimensional notion of operating point +(inflight). + +o It uses the three-part model to adapt the magnitude of its bandwidth +to match the estimated space available in the buffer, rather than (as +in BBR v1) assuming that it was always acceptable to place 0.25*BDP in +the bottleneck buffer when probing (commodity datacenter switches +commonly do not have that much buffer for WAN flows). When BBR v3 +estimates it hit a buffer limit during probing, its bandwidth probing +then starts gently in case little space is still available in the +buffer, and the accelerates, slowly at first and then rapidly if it +can grow inflight without seeing congestion signals. In such cases, +probing is bounded by inflight_hi + inflight_probe, where +inflight_probe grows as: [0, 1, 2, 4, 8, 16,...]. This allows BBR to +keep losses low and bounded if a bottleneck remains congested, while +rapidly/scalably utilizing free bandwidth when it becomes available. + +o It has a slightly revised state machine, to achieve the goals above. 
+ BBR_BW_PROBE_UP: pushes up inflight to probe for bw/vol + BBR_BW_PROBE_DOWN: drain excess inflight from the queue + BBR_BW_PROBE_CRUISE: use pipe, w/ headroom in queue/pipe + BBR_BW_PROBE_REFILL: try refill the pipe again to 100%, leaving queue empty + +o The estimated BDP: BBR v3 continues to maintain an estimate of the +path's two-way propagation delay, by tracking a windowed min_rtt, and +coordinating (on an as-ndeeded basis) to try to expose the two-way +propagation delay by draining the bottleneck queue. + +BBR v3 continues to use its min_rtt and (currently-applicable) bandwidth +estimate to estimate the current bandwidth-delay product. The estimated BDP +still provides one important guideline for bounding inflight data. However, +because any min-filtered RTT and max-filtered bw inherently tend to both +overestimate, the estimated BDP is often too high; in this case loss or ECN +marks can ensue, in which case BBR v3 adjusts inflight_hi and inflight_lo to +adapt its sending rate and inflight down to match the available capacity of the +path. + +o Space: Note that ICSK_CA_PRIV_SIZE increased. This is because BBR v3 +requires more space. Note that much of the space is due to support for +per-socket parameterization and debugging in this release for research +and debugging. With that state removed, the full "struct bbr" is 140 +bytes, or 144 with padding. This is an increase of 40 bytes over the +existing ca_priv space. + +o Code: BBR v3 reuses many pieces from BBR v1. But it omits the following + significant pieces: + + o "packet conservation" (bbr_set_cwnd_to_recover_or_restore(), + bbr_can_grow_inflight()) + o long-term bandwidth estimator ("policer mode") + + The code layout tries to keep BBR v3 code near the bottom of the + file, so that v1-applicable code in the top does not accidentally + refer to v3 code. 
+ +o Docs: + See the following docs for more details and diagrams decsribing the BBR v3 + algorithm: + https://datatracker.ietf.org/meeting/104/materials/slides-104-iccrg-an-update-on-bbr-00 + https://datatracker.ietf.org/meeting/102/materials/slides-102-iccrg-an-update-on-bbr-work-at-google-00 + +o Internal notes: + For this upstream rebase, Neal started from: + git show fed518041ac6:net/ipv4/tcp_bbr.c > net/ipv4/tcp_bbr.c + then removed dev instrumentation (dynamic get/set for parameters) + and code that was only used by BBRv1 + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 2c84098e60bed6d67dde23cd7538c51dee273102 +Change-Id: I125cf26ba2a7a686f2fa5e87f4c2afceb65f7a05 +Signed-off-by: Alexandre Frade +--- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 2 +- + include/uapi/linux/inet_diag.h | 23 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp_bbr.c | 2214 +++++++++++++++++++++------- + 5 files changed, 1740 insertions(+), 524 deletions(-) + +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -2473,7 +2473,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. 
It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. + + choice + prompt "Default TCP congestion control" +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 
+@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? 
*/ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200 + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UN + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_res + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const st + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_ra + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rt + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,31 +472,38 @@ static void bbr_set_pacing_rate(struct s + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) +-{ +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; +-} +- +-/* Return the number of segments BBR would like in a TSO/GSO skb, given +- * a particular max gso size as a constraint. ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). + */ + static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) + { +- u32 segs; ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); + ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return segs; + } + + /* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. 
*/ +-static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } +@@ -329,7 +513,7 @@ static u32 bbr_tso_segs_goal(struct sock + { + struct tcp_sock *tp = tcp_sk(sk); + +- return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -349,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(s + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -360,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(s + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -382,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -402,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -473,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(stru + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -484,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(stru + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -552,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -629,191 +738,49 @@ static void bbr_reset_startup_mode(struc + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; + +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -827,7 +794,7 @@ static void bbr_update_bw(struct sock *s + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -835,15 +802,19 @@ static void bbr_update_ack_aggregation(s + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -877,49 +848,6 @@ static void bbr_update_ack_aggregation(s + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -929,9 +857,9 @@ static void bbr_check_probe_rtt_done(str + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -957,23 +885,35 @@ static void bbr_update_min_rtt(struct so + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -982,9 +922,9 @@ static void bbr_update_min_rtt(struct so + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -1005,18 +945,20 @@ static void bbr_update_gains(struct sock + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1028,27 +970,1108 @@ static void bbr_update_gains(struct sock + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++{ ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; ++} ++ ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); ++} ++ ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} ++ ++/* Reset the estimator for reaching full bandwidth based on bw plateau. */ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw = 0; ++ bbr->full_bw_cnt = 0; ++ bbr->full_bw_now = 0; ++} ++ ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); ++ ++ return min(bdp, tcp_sk(sk)->snd_cwnd); ++} ++ ++static bool bbr_is_probing_bandwidth(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? 
*/ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) + { +- bbr_update_bw(sk, rs); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. */ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; ++} ++ ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. ++ */ ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. 
*/ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? 
*/ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. ++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if we have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta).
*/ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. ++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. 
*/ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. 
++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. 
++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. */ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. 
++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less than that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW.
*/ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides significant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well as control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not.
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; + } + + __bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); + +- bbr_update_model(sk, rs); ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; + ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; + } + + __bpf_kfunc static void bbr_init(struct sock *sk) +@@ -1056,20 +2079,21 @@ __bpf_kfunc static void bbr_init(struct + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = tp->delivered; + bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ +- + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + +@@ -1080,7 +2104,7 @@ __bpf_kfunc static void bbr_init(struct + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); ++ + bbr_reset_startup_mode(sk); + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; +@@ -1090,78 +2114,236 @@ __bpf_kfunc static void bbr_init(struct + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ 
++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. +- */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. 
++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. */ + __bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) + { + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ +- bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +@@ -1174,10 +2356,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1210,5 +2393,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch b/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch new file mode 100644 index 0000000..af8f105 --- /dev/null +++ 
b/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch @@ -0,0 +1,59 @@ +From 99e86f904f246ae9ec7a13d1b920eaf4a8c2d47b Mon Sep 17 00:00:00 2001 +From: Adithya Abraham Philip +Date: Fri, 11 Jun 2021 21:56:10 +0000 +Subject: [PATCH 17/19] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT + on retransmits + +Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to +indicate that retransmitted packets and pure ACKs must have the +ECT bit set. This is necessary for BBR, which when using +ECN expects ECT to be set even on retransmitted packets and ACKs. + +Previous to this addition of TCP_ECN_ECT_PERMANENT, CCAs which can use +ECN but don't "need" it did not have a way to indicate that ECT should +be set on retransmissions/ACKs. + +Signed-off-by: Adithya Abraham Philip +Signed-off-by: Neal Cardwell +Change-Id: I8b048eaab35e136fe6501ef6cd89fd9faa15e6d2 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_bbr.c | 3 +++ + net/ipv4/tcp_output.c | 3 ++- + 3 files changed, 6 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,7 @@ static inline void tcp_dec_quickack_mode + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 + #define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -2151,6 +2151,9 @@ __bpf_kfunc static void bbr_init(struct + bbr->plb.pause_until = 0; + + tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; + } + + /* BBR marks the current round trip as a loss round. */ +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -390,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } diff --git a/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch b/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch new file mode 100644 index 0000000..a5d2395 --- /dev/null +++ b/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch @@ -0,0 +1,38 @@ +From 5d7cb61552d374bcaaa95022129b4ca1eace1c33 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 23 Jul 2023 23:25:34 -0400 +Subject: [PATCH 18/19] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options + field + +Analogous to other important ECN information, export TCPI_OPT_ECN_LOW +in tcp_info tcpi_options field. 
+ +Signed-off-by: Neal Cardwell +Change-Id: I08d8d8c7e8780e6e37df54038ee50301ac5a0320 +Signed-off-by: Alexandre Frade +--- + include/uapi/linux/tcp.h | 1 + + net/ipv4/tcp.c | 2 ++ + 2 files changed, 3 insertions(+) + +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3850,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struc + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) diff --git a/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch b/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch new file mode 100644 index 0000000..0fb0528 --- /dev/null +++ b/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch @@ -0,0 +1,42 @@ +From 39838c2f0b09bec02004c092904aada85da2bc2e Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 11 Mar 2024 12:01:13 -0300 +Subject: [PATCH 19/19] x86/cfi,bpf: Add tso_segs and skb_marked_lost to + bpf_struct_ops CFI + +Rebased-by: Oleksandr Natalenko +[ https://github.com/sirlucjan/kernel-patches/blob/master/6.8/bbr3-patches/0001-tcp-bbr3-initial-import.patch ] +Signed-off-by: Alexandre Frade +--- + net/ipv4/bpf_tcp_ca.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_o + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch b/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch new file mode 100644 index 0000000..0072ea3 --- /dev/null +++ b/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch @@ -0,0 +1,375 @@ +From 60b01019526236e40466cf20bf1192074e5e1a7c Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:27 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ANY. + +This corresponds to part of the functionality of the NT syscall +NtWaitForMultipleObjects(). 
Specifically, it implements the behaviour where +the third argument (wait_any) is TRUE, and it does not handle alertable waits. +Those features have been split out into separate patches to ease review. + +This patch therefore implements the wait/wake infrastructure which comprises the +core of ntsync's functionality. + +NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike +poll(), it "consumes" objects when they are signaled. For semaphores, this means +decreasing one from the internal counter. At most one object can be consumed by +this function. + +This wait/wake model is fundamentally different from that used anywhere else in +the kernel, and for that reason ntsync does not use any existing infrastructure, +such as futexes, kernel mutexes or semaphores, or wait_event(). + +Up to 64 objects can be waited on at once. As soon as one is signaled, the +object with the lowest index is consumed, and that index is returned via the +"index" field. + +A timeout is supported. The timeout is passed as a u64 nanosecond value, which +represents absolute time measured against either the MONOTONIC or REALTIME clock +(controlled by the flags argument). If U64_MAX is passed, the ioctl waits +indefinitely. + +This ioctl validates that all objects belong to the relevant device. This is not +necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be +necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch. + +Some padding fields are added for alignment and for fields which will be added +in future patches (split out to ease review). + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 245 ++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 14 +++ + 2 files changed, 259 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -6,11 +6,16 @@ + */ + + #include ++#include + #include + #include ++#include ++#include + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -30,6 +35,8 @@ enum ntsync_type { + * + * Both rely on struct file for reference counting. Individual + * ntsync_obj objects take a reference to the device when created. ++ * Wait operations take a reference to each object being waited on for ++ * the duration of the wait. + */ + + struct ntsync_obj { +@@ -47,12 +54,55 @@ struct ntsync_obj { + __u32 max; + } sem; + } u; ++ ++ struct list_head any_waiters; ++}; ++ ++struct ntsync_q_entry { ++ struct list_head node; ++ struct ntsync_q *q; ++ struct ntsync_obj *obj; ++ __u32 index; ++}; ++ ++struct ntsync_q { ++ struct task_struct *task; ++ ++ /* ++ * Protected via atomic_try_cmpxchg(). Only the thread that wins the ++ * compare-and-swap may actually change object states and wake this ++ * task. ++ */ ++ atomic_t signaled; ++ ++ __u32 count; ++ struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { + struct file *file; + }; + ++static void try_wake_any_sem(struct ntsync_obj *sem) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&sem->lock); ++ ++ list_for_each_entry(entry, &sem->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!sem->u.sem.count) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ sem->u.sem.count--; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -88,6 +138,8 @@ static int ntsync_sem_post(struct ntsync + + prev_count = sem->u.sem.count; + ret = post_sem_state(sem, args); ++ if (!ret) ++ try_wake_any_sem(sem); + + spin_unlock(&sem->lock); + +@@ -141,6 +193,7 @@ static struct ntsync_obj *ntsync_alloc_o + obj->dev = dev; + get_file(dev->file); + spin_lock_init(&obj->lock); ++ INIT_LIST_HEAD(&obj->any_waiters); + + return obj; + } +@@ -191,6 +244,196 @@ static int ntsync_create_sem(struct ntsy + return put_user(fd, &user_args->sem); + } + ++static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) ++{ ++ struct file *file = fget(fd); ++ struct ntsync_obj *obj; ++ ++ if (!file) ++ return NULL; ++ ++ if (file->f_op != &ntsync_obj_fops) { ++ fput(file); ++ return NULL; ++ } ++ ++ obj = file->private_data; ++ if (obj->dev != dev) { ++ fput(file); ++ return NULL; ++ } ++ ++ return obj; ++} ++ ++static void put_obj(struct ntsync_obj *obj) ++{ ++ fput(obj->file); ++} ++ ++static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) ++{ ++ ktime_t timeout = ns_to_ktime(args->timeout); ++ clockid_t clock = CLOCK_MONOTONIC; ++ ktime_t *timeout_ptr; ++ int ret = 0; ++ ++ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout); ++ ++ if (args->flags & NTSYNC_WAIT_REALTIME) ++ clock = CLOCK_REALTIME; ++ ++ do { ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (atomic_read(&q->signaled) != -1) { ++ ret = 0; ++ break; ++ } ++ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); ++ } while (ret < 0); ++ __set_current_state(TASK_RUNNING); ++ ++ return ret; ++} ++ ++/* ++ * Allocate and initialize the ntsync_q structure, but do not queue us yet. ++ */ ++static int setup_wait(struct ntsync_device *dev, ++ const struct ntsync_wait_args *args, ++ struct ntsync_q **ret_q) ++{ ++ const __u32 count = args->count; ++ int fds[NTSYNC_MAX_WAIT_COUNT]; ++ struct ntsync_q *q; ++ __u32 i, j; ++ ++ if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ return -EINVAL; ++ ++ if (args->count > NTSYNC_MAX_WAIT_COUNT) ++ return -EINVAL; ++ ++ if (copy_from_user(fds, u64_to_user_ptr(args->objs), ++ array_size(count, sizeof(*fds)))) ++ return -EFAULT; ++ ++ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); ++ if (!q) ++ return -ENOMEM; ++ q->task = current; ++ atomic_set(&q->signaled, -1); ++ q->count = count; ++ ++ for (i = 0; i < count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = get_obj(dev, fds[i]); ++ ++ if (!obj) ++ goto err; ++ ++ entry->obj = obj; ++ entry->q = q; ++ entry->index = i; ++ } ++ ++ *ret_q = q; ++ return 0; ++ ++err: ++ for (j = 0; j < i; j++) ++ put_obj(q->entries[j].obj); ++ kfree(q); ++ return -EINVAL; ++} ++ ++static void try_wake_any_obj(struct ntsync_obj *obj) ++{ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ try_wake_any_sem(obj); ++ break; ++ } ++} ++ ++static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ spin_lock(&obj->lock); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ 
spin_unlock(&obj->lock); ++ } ++ ++ /* check if we are already signaled */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ if (atomic_read(&q->signaled) != -1) ++ break; ++ ++ spin_lock(&obj->lock); ++ try_wake_any_obj(obj); ++ spin_unlock(&obj->lock); ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ spin_lock(&obj->lock); ++ list_del(&entry->node); ++ spin_unlock(&obj->lock); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ + static int ntsync_char_open(struct inode *inode, struct file *file) + { + struct ntsync_device *dev; +@@ -222,6 +465,8 @@ static long ntsync_char_ioctl(struct fil + switch (cmd) { + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ANY: ++ return ntsync_wait_any(dev, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -16,7 +16,21 @@ struct ntsync_sem_args { + __u32 max; + }; + ++#define NTSYNC_WAIT_REALTIME 0x1 ++ ++struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 index; ++ __u32 flags; ++ __u32 pad[3]; ++}; ++ ++#define NTSYNC_MAX_WAIT_COUNT 64 ++ + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch b/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch new file mode 100644 index 0000000..c3cc639 --- /dev/null +++ b/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch @@ -0,0 +1,532 @@ +From 73fc33606fcb7028ec1ee6027a361de4e85ab5d6 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:28 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ALL. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are +simultaneously signaled, and then acquires all of them as a single atomic +operation. + +Because acquisition of multiple objects is atomic, some complex locking is +required. We cannot simply spin-lock multiple objects simultaneously, as that +may disable preëmption for a problematically long time. + +Instead, modifying any object which may be involved in a wait-all operation takes +a device-wide sleeping mutex, "wait_all_lock", instead of the normal object +spinlock. + +Because wait-for-all is a rare operation, in order to optimize wait-for-any, +this lock is only taken when necessary. "all_hint" is used to mark objects which +are involved in a wait-for-all operation, and if an object is not, only its +spinlock is taken. + +The locking scheme used here was written by Peter Zijlstra. 
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 334 ++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 322 insertions(+), 13 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -41,6 +42,7 @@ enum ntsync_type { + + struct ntsync_obj { + spinlock_t lock; ++ int dev_locked; + + enum ntsync_type type; + +@@ -55,7 +57,30 @@ struct ntsync_obj { + } sem; + } u; + ++ /* ++ * any_waiters is protected by the object lock, but all_waiters is ++ * protected by the device wait_all_lock. ++ */ + struct list_head any_waiters; ++ struct list_head all_waiters; ++ ++ /* ++ * Hint describing how many tasks are queued on this object in a ++ * wait-all operation. ++ * ++ * Any time we do a wake, we may need to wake "all" waiters as well as ++ * "any" waiters. In order to atomically wake "all" waiters, we must ++ * lock all of the objects, and that means grabbing the wait_all_lock ++ * below (and, due to lock ordering rules, before locking this object). ++ * However, wait-all is a rare operation, and grabbing the wait-all ++ * lock for every wake would create unnecessary contention. ++ * Therefore we first check whether all_hint is zero, and, if it is, ++ * we skip trying to wake "all" waiters. ++ * ++ * Since wait requests must originate from user-space threads, we're ++ * limited here by PID_MAX_LIMIT, so there's no risk of overflow. ++ */ ++ atomic_t all_hint; + }; + + struct ntsync_q_entry { +@@ -75,19 +100,198 @@ struct ntsync_q { + */ + atomic_t signaled; + ++ bool all; + __u32 count; + struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { ++ /* ++ * Wait-all operations must atomically grab all objects, and be totally ++ * ordered with respect to each other and wait-any operations. ++ * If one thread is trying to acquire several objects, another thread ++ * cannot touch the object at the same time. ++ * ++ * This device-wide lock is used to serialize wait-for-all ++ * operations, and operations on an object that is involved in a ++ * wait-for-all. ++ */ ++ struct mutex wait_all_lock; ++ + struct file *file; + }; + ++/* ++ * Single objects are locked using obj->lock. ++ * ++ * Multiple objects are 'locked' while holding dev->wait_all_lock. ++ * In this case however, individual objects are not locked by holding ++ * obj->lock, but by setting obj->dev_locked. ++ * ++ * This means that in order to lock a single object, the sequence is slightly ++ * more complicated than usual. Specifically it needs to check obj->dev_locked ++ * after acquiring obj->lock, if set, it needs to drop the lock and acquire ++ * dev->wait_all_lock in order to serialize against the multi-object operation. ++ */ ++ ++static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ /* ++ * By setting obj->dev_locked inside obj->lock, it is ensured that ++ * anyone holding obj->lock must see the value. 
++ */ ++ obj->dev_locked = 1; ++ spin_unlock(&obj->lock); ++} ++ ++static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ obj->dev_locked = 0; ++ spin_unlock(&obj->lock); ++} ++ ++static void obj_lock(struct ntsync_obj *obj) ++{ ++ struct ntsync_device *dev = obj->dev; ++ ++ for (;;) { ++ spin_lock(&obj->lock); ++ if (likely(!obj->dev_locked)) ++ break; ++ ++ spin_unlock(&obj->lock); ++ mutex_lock(&dev->wait_all_lock); ++ spin_lock(&obj->lock); ++ /* ++ * obj->dev_locked should be set and released under the same ++ * wait_all_lock section, since we now own this lock, it should ++ * be clear. ++ */ ++ lockdep_assert(!obj->dev_locked); ++ spin_unlock(&obj->lock); ++ mutex_unlock(&dev->wait_all_lock); ++ } ++} ++ ++static void obj_unlock(struct ntsync_obj *obj) ++{ ++ spin_unlock(&obj->lock); ++} ++ ++static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ bool all; ++ ++ obj_lock(obj); ++ all = atomic_read(&obj->all_hint); ++ if (unlikely(all)) { ++ obj_unlock(obj); ++ mutex_lock(&dev->wait_all_lock); ++ dev_lock_obj(dev, obj); ++ } ++ ++ return all; ++} ++ ++static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) ++{ ++ if (all) { ++ dev_unlock_obj(dev, obj); ++ mutex_unlock(&dev->wait_all_lock); ++ } else { ++ obj_unlock(obj); ++ } ++} ++ ++#define ntsync_assert_held(obj) \ ++ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ ++ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ ++ (obj)->dev_locked)) ++ ++static bool is_signaled(struct ntsync_obj *obj) ++{ ++ ntsync_assert_held(obj); ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ return !!obj->u.sem.count; ++ } ++ ++ WARN(1, "bad object type %#x\n", obj->type); ++ return false; ++} ++ ++/* ++ * "locked_obj" is an optional pointer to an object which is already locked and ++ * should not be locked again. This is necessary so that changing an object's ++ * state and waking it can be a single atomic operation. 
++ */ ++static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, ++ struct ntsync_obj *locked_obj) ++{ ++ __u32 count = q->count; ++ bool can_wake = true; ++ int signaled = -1; ++ __u32 i; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ if (locked_obj) ++ lockdep_assert(locked_obj->dev_locked); ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_lock_obj(dev, q->entries[i].obj); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (!is_signaled(q->entries[i].obj)) { ++ can_wake = false; ++ break; ++ } ++ } ++ ++ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { ++ for (i = 0; i < count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ obj->u.sem.count--; ++ break; ++ } ++ } ++ wake_up_process(q->task); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_unlock_obj(dev, q->entries[i].obj); ++ } ++} ++ ++static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev_locked); ++ ++ list_for_each_entry(entry, &obj->all_waiters, node) ++ try_wake_all(dev, entry->q, obj); ++} ++ + static void try_wake_any_sem(struct ntsync_obj *sem) + { + struct ntsync_q_entry *entry; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); ++ lockdep_assert(sem->type == NTSYNC_TYPE_SEM); + + list_for_each_entry(entry, &sem->any_waiters, node) { + struct ntsync_q *q = entry->q; +@@ -111,7 +315,7 @@ static int post_sem_state(struct ntsync_ + { + __u32 sum; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); + + if (check_add_overflow(sem->u.sem.count, count, &sum) || + sum > sem->u.sem.max) +@@ -123,9 +327,11 @@ static int post_sem_state(struct ntsync_ + + static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + { ++ struct ntsync_device *dev = sem->dev; + __u32 __user *user_args = argp; + __u32 prev_count; + __u32 args; ++ bool all; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -134,14 +340,17 @@ static int ntsync_sem_post(struct ntsync + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + +- spin_lock(&sem->lock); ++ all = ntsync_lock_obj(dev, sem); + + prev_count = sem->u.sem.count; + ret = post_sem_state(sem, args); +- if (!ret) ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, sem); + try_wake_any_sem(sem); ++ } + +- spin_unlock(&sem->lock); ++ ntsync_unlock_obj(dev, sem, all); + + if (!ret && put_user(prev_count, user_args)) + ret = -EFAULT; +@@ -194,6 +403,8 @@ static struct ntsync_obj *ntsync_alloc_o + get_file(dev->file); + spin_lock_init(&obj->lock); + INIT_LIST_HEAD(&obj->any_waiters); ++ INIT_LIST_HEAD(&obj->all_waiters); ++ atomic_set(&obj->all_hint, 0); + + return obj; + } +@@ -305,7 +516,7 @@ static int ntsync_schedule(const struct + * Allocate and initialize the ntsync_q structure, but do not queue us yet. 
+ */ + static int setup_wait(struct ntsync_device *dev, +- const struct ntsync_wait_args *args, ++ const struct ntsync_wait_args *args, bool all, + struct ntsync_q **ret_q) + { + const __u32 count = args->count; +@@ -328,6 +539,7 @@ static int setup_wait(struct ntsync_devi + return -ENOMEM; + q->task = current; + atomic_set(&q->signaled, -1); ++ q->all = all; + q->count = count; + + for (i = 0; i < count; i++) { +@@ -337,6 +549,16 @@ static int setup_wait(struct ntsync_devi + if (!obj) + goto err; + ++ if (all) { ++ /* Check that the objects are all distinct. */ ++ for (j = 0; j < i; j++) { ++ if (obj == q->entries[j].obj) { ++ put_obj(obj); ++ goto err; ++ } ++ } ++ } ++ + entry->obj = obj; + entry->q = q; + entry->index = i; +@@ -366,13 +588,14 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_wait_args args; + struct ntsync_q *q; + int signaled; ++ bool all; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + +- ret = setup_wait(dev, &args, &q); ++ ret = setup_wait(dev, &args, false, &q); + if (ret < 0) + return ret; + +@@ -382,9 +605,9 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + list_add_tail(&entry->node, &obj->any_waiters); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); + } + + /* check if we are already signaled */ +@@ -395,9 +618,9 @@ static int ntsync_wait_any(struct ntsync + if (atomic_read(&q->signaled) != -1) + break; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + try_wake_any_obj(obj); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); + } + + /* sleep */ +@@ -410,13 +633,94 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + list_del(&entry->node); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ ++static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, true, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ atomic_inc(&obj->all_hint); ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire obj->lock ++ * here. 
++ */ ++ list_add_tail(&entry->node, &obj->all_waiters); ++ } ++ ++ /* check if we are already signaled */ ++ ++ try_wake_all(dev, q, NULL); ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire it here. ++ */ ++ list_del(&entry->node); ++ ++ atomic_dec(&obj->all_hint); + + put_obj(obj); + } + ++ mutex_unlock(&dev->wait_all_lock); ++ + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; +@@ -442,6 +746,8 @@ static int ntsync_char_open(struct inode + if (!dev) + return -ENOMEM; + ++ mutex_init(&dev->wait_all_lock); ++ + file->private_data = dev; + dev->file = file; + return nonseekable_open(inode, file); +@@ -465,6 +771,8 @@ static long ntsync_char_ioctl(struct fil + switch (cmd) { + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ALL: ++ return ntsync_wait_all(dev, argp); + case NTSYNC_IOC_WAIT_ANY: + return ntsync_wait_any(dev, argp); + default: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -31,6 +31,7 @@ struct ntsync_wait_args { + + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) ++#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch b/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch new file mode 100644 index 0000000..377e09d --- /dev/null +++ b/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch @@ -0,0 +1,226 @@ +From fdeceab49078a80987c665ed837ee4f1b8a942a8 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:29 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX. + +This corresponds to the NT syscall NtCreateMutant(). + +An NT mutex is recursive, with a 32-bit recursion counter. When acquired via +NtWaitForMultipleObjects(), the recursion counter is incremented by one. The OS +records the thread which acquired it. + +The OS records the thread which acquired it. However, in order to keep this +driver self-contained, the owning thread ID is managed by user-space, and passed +as a parameter to all relevant ioctls. + +The initial owner and recursion count, if any, are specified when the mutex is +created. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 77 +++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 10 ++++- + 2 files changed, 83 insertions(+), 4 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -25,6 +25,7 @@ + + enum ntsync_type { + NTSYNC_TYPE_SEM, ++ NTSYNC_TYPE_MUTEX, + }; + + /* +@@ -55,6 +56,10 @@ struct ntsync_obj { + __u32 count; + __u32 max; + } sem; ++ struct { ++ __u32 count; ++ pid_t owner; ++ } mutex; + } u; + + /* +@@ -92,6 +97,7 @@ struct ntsync_q_entry { + + struct ntsync_q { + struct task_struct *task; ++ __u32 owner; + + /* + * Protected via atomic_try_cmpxchg(). 
Only the thread that wins the +@@ -214,13 +220,17 @@ static void ntsync_unlock_obj(struct nts + ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ + (obj)->dev_locked)) + +-static bool is_signaled(struct ntsync_obj *obj) ++static bool is_signaled(struct ntsync_obj *obj, __u32 owner) + { + ntsync_assert_held(obj); + + switch (obj->type) { + case NTSYNC_TYPE_SEM: + return !!obj->u.sem.count; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.owner && obj->u.mutex.owner != owner) ++ return false; ++ return obj->u.mutex.count < UINT_MAX; + } + + WARN(1, "bad object type %#x\n", obj->type); +@@ -250,7 +260,7 @@ static void try_wake_all(struct ntsync_d + } + + for (i = 0; i < count; i++) { +- if (!is_signaled(q->entries[i].obj)) { ++ if (!is_signaled(q->entries[i].obj, q->owner)) { + can_wake = false; + break; + } +@@ -264,6 +274,10 @@ static void try_wake_all(struct ntsync_d + case NTSYNC_TYPE_SEM: + obj->u.sem.count--; + break; ++ case NTSYNC_TYPE_MUTEX: ++ obj->u.mutex.count++; ++ obj->u.mutex.owner = q->owner; ++ break; + } + } + wake_up_process(q->task); +@@ -307,6 +321,30 @@ static void try_wake_any_sem(struct ntsy + } + } + ++static void try_wake_any_mutex(struct ntsync_obj *mutex) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(mutex); ++ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); ++ ++ list_for_each_entry(entry, &mutex->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (mutex->u.mutex.count == UINT_MAX) ++ break; ++ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) ++ continue; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ mutex->u.mutex.count++; ++ mutex->u.mutex.owner = q->owner; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -455,6 +493,33 @@ static int ntsync_create_sem(struct ntsy + return put_user(fd, &user_args->sem); + } + ++static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_mutex_args args; ++ struct ntsync_obj *mutex; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ if (!args.owner != !args.count) ++ return -EINVAL; ++ ++ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); ++ if (!mutex) ++ return -ENOMEM; ++ mutex->u.mutex.count = args.count; ++ mutex->u.mutex.owner = args.owner; ++ fd = ntsync_obj_get_fd(mutex); ++ if (fd < 0) { ++ kfree(mutex); ++ return fd; ++ } ++ ++ return put_user(fd, &user_args->mutex); ++} ++ + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) + { + struct file *file = fget(fd); +@@ -524,7 +589,7 @@ static int setup_wait(struct ntsync_devi + struct ntsync_q *q; + __u32 i, j; + +- if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) + return -EINVAL; + + if (args->count > NTSYNC_MAX_WAIT_COUNT) +@@ -538,6 +603,7 @@ static int setup_wait(struct ntsync_devi + if (!q) + return -ENOMEM; + q->task = current; ++ q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->all = all; + q->count = count; +@@ -580,6 +646,9 @@ static void try_wake_any_obj(struct ntsy + case NTSYNC_TYPE_SEM: + try_wake_any_sem(obj); + break; ++ case NTSYNC_TYPE_MUTEX: ++ try_wake_any_mutex(obj); ++ break; + } + } + +@@ -769,6 +838,8 @@ static long ntsync_char_ioctl(struct fil + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_MUTEX: ++ return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); + case NTSYNC_IOC_WAIT_ALL: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -16,6 +16,12 @@ struct ntsync_sem_args { + __u32 max; + }; + ++struct ntsync_mutex_args { ++ __u32 mutex; ++ __u32 owner; ++ __u32 count; ++}; ++ + #define NTSYNC_WAIT_REALTIME 0x1 + + struct ntsync_wait_args { +@@ -24,7 +30,8 @@ struct ntsync_wait_args { + __u32 count; + __u32 index; + __u32 flags; +- __u32 pad[3]; ++ __u32 owner; ++ __u32 pad[2]; + }; + + #define NTSYNC_MAX_WAIT_COUNT 64 +@@ -32,6 +39,7 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch b/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch new file mode 100644 index 0000000..988aa19 --- /dev/null +++ b/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch @@ -0,0 +1,95 @@ +From cc9ade623cd90cd002fb86f3aa249af2e6e4019e Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:30 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK. + +This corresponds to the NT syscall NtReleaseMutant(). + +This syscall decrements the mutex's recursion count by one, and returns the +previous value. If the mutex is not owned by the current task, the function +instead fails and returns -EPERM. 
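A hedged sketch of the corresponding user-space call (illustration only; mutex_fd and the
owner identifier are assumed to be tracked by the NT emulator):

    static int release_mutex(int mutex_fd, __u32 owner)
    {
            struct ntsync_mutex_args args = { .owner = owner };

            /* fails with EPERM if "owner" does not own the mutex, EINVAL if owner is zero */
            if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_UNLOCK, &args) < 0)
                    return -1;
            return args.count;  /* previous recursion count, filled in by the driver */
    }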
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 53 +++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 54 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -396,6 +396,57 @@ static int ntsync_sem_post(struct ntsync + return ret; + } + ++/* ++ * Actually change the mutex state, returning -EPERM if not the owner. ++ */ ++static int unlock_mutex_state(struct ntsync_obj *mutex, ++ const struct ntsync_mutex_args *args) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != args->owner) ++ return -EPERM; ++ ++ if (!--mutex->u.mutex.count) ++ mutex->u.mutex.owner = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ __u32 prev_count; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ if (!args.owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ prev_count = mutex->u.mutex.count; ++ ret = unlock_mutex_state(mutex, &args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (!ret && put_user(prev_count, &user_args->count)) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -415,6 +466,8 @@ static long ntsync_obj_ioctl(struct file + switch (cmd) { + case NTSYNC_IOC_SEM_POST: + return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_MUTEX_UNLOCK: ++ return ntsync_mutex_unlock(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -42,5 +42,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch b/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch new file mode 100644 index 0000000..8eb42fe --- /dev/null +++ b/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch @@ -0,0 +1,156 @@ +From dca3fe766afa42e34f5d3f62c0f2850760663176 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:31 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_KILL. + +This does not correspond to any NT syscall. Rather, when a thread dies, it +should be called by the NT emulator for each mutex, with the TID of the dying +thread. + +NT mutexes are robust (in the pthread sense). When an NT thread dies, any +mutexes it owned are immediately released. Acquisition of those mutexes by other +threads will return a special value indicating that the mutex was abandoned, +like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed +used here for that purpose. 
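A sketch of how an NT emulator might use this when a thread exits (illustration only; the
thread id bookkeeping is assumed to live entirely in user space):

    __u32 tid = exiting_thread_id;   /* hypothetical: whatever id was passed as "owner" */

    if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_KILL, &tid) == 0) {
            /* the mutex is now abandoned: unowned, count zero, and the next
             * successful waiter will see EOWNERDEAD from the wait ioctl */
    }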
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 61 +++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 60 insertions(+), 2 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -59,6 +59,7 @@ struct ntsync_obj { + struct { + __u32 count; + pid_t owner; ++ bool ownerdead; + } mutex; + } u; + +@@ -107,6 +108,7 @@ struct ntsync_q { + atomic_t signaled; + + bool all; ++ bool ownerdead; + __u32 count; + struct ntsync_q_entry entries[]; + }; +@@ -275,6 +277,9 @@ static void try_wake_all(struct ntsync_d + obj->u.sem.count--; + break; + case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.ownerdead) ++ q->ownerdead = true; ++ obj->u.mutex.ownerdead = false; + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; +@@ -338,6 +343,9 @@ static void try_wake_any_mutex(struct nt + continue; + + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (mutex->u.mutex.ownerdead) ++ q->ownerdead = true; ++ mutex->u.mutex.ownerdead = false; + mutex->u.mutex.count++; + mutex->u.mutex.owner = q->owner; + wake_up_process(q->task); +@@ -447,6 +455,52 @@ static int ntsync_mutex_unlock(struct nt + return ret; + } + ++/* ++ * Actually change the mutex state to mark its owner as dead, ++ * returning -EPERM if not the owner. ++ */ ++static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != owner) ++ return -EPERM; ++ ++ mutex->u.mutex.ownerdead = true; ++ mutex->u.mutex.owner = 0; ++ mutex->u.mutex.count = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_device *dev = mutex->dev; ++ __u32 owner; ++ bool all; ++ int ret; ++ ++ if (get_user(owner, (__u32 __user *)argp)) ++ return -EFAULT; ++ if (!owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ ret = kill_mutex_state(mutex, owner); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -468,6 +522,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_sem_post(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: + return ntsync_mutex_unlock(obj, argp); ++ case NTSYNC_IOC_MUTEX_KILL: ++ return ntsync_mutex_kill(obj, argp); + default: + return -ENOIOCTLCMD; + } +@@ -659,6 +715,7 @@ static int setup_wait(struct ntsync_devi + q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->all = all; ++ q->ownerdead = false; + q->count = count; + + for (i = 0; i < count; i++) { +@@ -767,7 +824,7 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ +- ret = 0; ++ ret = q->ownerdead ? -EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; +@@ -848,7 +905,7 @@ static int ntsync_wait_all(struct ntsync + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ +- ret = 0; ++ ret = q->ownerdead ? 
-EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -43,5 +43,6 @@ struct ntsync_wait_args { + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) ++#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch b/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch new file mode 100644 index 0000000..5cbb5f2 --- /dev/null +++ b/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch @@ -0,0 +1,166 @@ +From 3f3bbc85f1e613364261d685b8197c32ffdeaad0 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:32 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_CREATE_EVENT. + +This correspond to the NT syscall NtCreateEvent(). + +An NT event holds a single bit of state denoting whether it is signaled or +unsignaled. + +There are two types of events: manual-reset and automatic-reset. When an +automatic-reset event is acquired via a wait function, its state is reset to +unsignaled. Manual-reset events are not affected by wait functions. + +Whether the event is manual-reset, and its initial state, are specified at +creation time. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 62 +++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 7 +++++ + 2 files changed, 69 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -26,6 +26,7 @@ + enum ntsync_type { + NTSYNC_TYPE_SEM, + NTSYNC_TYPE_MUTEX, ++ NTSYNC_TYPE_EVENT, + }; + + /* +@@ -61,6 +62,10 @@ struct ntsync_obj { + pid_t owner; + bool ownerdead; + } mutex; ++ struct { ++ bool manual; ++ bool signaled; ++ } event; + } u; + + /* +@@ -233,6 +238,8 @@ static bool is_signaled(struct ntsync_ob + if (obj->u.mutex.owner && obj->u.mutex.owner != owner) + return false; + return obj->u.mutex.count < UINT_MAX; ++ case NTSYNC_TYPE_EVENT: ++ return obj->u.event.signaled; + } + + WARN(1, "bad object type %#x\n", obj->type); +@@ -283,6 +290,10 @@ static void try_wake_all(struct ntsync_d + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; ++ case NTSYNC_TYPE_EVENT: ++ if (!obj->u.event.manual) ++ obj->u.event.signaled = false; ++ break; + } + } + wake_up_process(q->task); +@@ -353,6 +364,28 @@ static void try_wake_any_mutex(struct nt + } + } + ++static void try_wake_any_event(struct ntsync_obj *event) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(event); ++ lockdep_assert(event->type == NTSYNC_TYPE_EVENT); ++ ++ list_for_each_entry(entry, &event->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!event->u.event.signaled) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (!event->u.event.manual) ++ event->u.event.signaled = false; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -629,6 +662,30 @@ static int ntsync_create_mutex(struct nt + return put_user(fd, &user_args->mutex); + } + ++static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_event_args args; ++ struct ntsync_obj *event; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); ++ if (!event) ++ return -ENOMEM; ++ event->u.event.manual = args.manual; ++ event->u.event.signaled = args.signaled; ++ fd = ntsync_obj_get_fd(event); ++ if (fd < 0) { ++ kfree(event); ++ return fd; ++ } ++ ++ return put_user(fd, &user_args->event); ++} ++ + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) + { + struct file *file = fget(fd); +@@ -759,6 +816,9 @@ static void try_wake_any_obj(struct ntsy + case NTSYNC_TYPE_MUTEX: + try_wake_any_mutex(obj); + break; ++ case NTSYNC_TYPE_EVENT: ++ try_wake_any_event(obj); ++ break; + } + } + +@@ -948,6 +1008,8 @@ static long ntsync_char_ioctl(struct fil + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_EVENT: ++ return ntsync_create_event(dev, argp); + case NTSYNC_IOC_CREATE_MUTEX: + return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -22,6 +22,12 @@ struct ntsync_mutex_args { + __u32 count; + }; + ++struct ntsync_event_args { ++ __u32 event; ++ __u32 manual; ++ __u32 signaled; ++}; ++ + #define NTSYNC_WAIT_REALTIME 0x1 + + struct ntsync_wait_args { +@@ -40,6 +46,7 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) + #define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOWR('N', 0x87, struct ntsync_event_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) diff --git a/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch b/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch new file mode 100644 index 0000000..af5f192 --- /dev/null +++ b/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch @@ -0,0 +1,67 @@ +From a6f107f17a976008b85c3e269bf4196e595d3f52 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:33 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_SET. + +This corresponds to the NT syscall NtSetEvent(). + +This sets the event to the signaled state, and returns its previous state. 
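A minimal usage sketch (illustration only; event_fd is assumed to come from
NTSYNC_IOC_CREATE_EVENT):

    __u32 prev = 0;

    if (ioctl(event_fd, NTSYNC_IOC_EVENT_SET, &prev) == 0 && prev) {
            /* the event was already signaled before this call */
    }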
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 27 +++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 28 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -534,6 +534,31 @@ static int ntsync_mutex_kill(struct ntsy + return ret; + } + ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = true; ++ if (all) ++ try_wake_all_obj(dev, event); ++ try_wake_any_event(event); ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -557,6 +582,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_EVENT_SET: ++ return ntsync_event_set(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -51,5 +51,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) ++#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch b/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch new file mode 100644 index 0000000..1500576 --- /dev/null +++ b/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch @@ -0,0 +1,64 @@ +From aa3ebb5870eb9ed259aba2ed4e07e9993e6cd978 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:34 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_RESET. + +This corresponds to the NT syscall NtResetEvent(). + +This sets the event to the unsignaled state, and returns its previous state. 
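Because the previous state is returned and the state change happens under the object lock,
the ioctl can double as an atomic test-and-clear; a sketch (event_fd assumed as above,
illustration only):

    __u32 was_signaled = 0;

    if (ioctl(event_fd, NTSYNC_IOC_EVENT_RESET, &was_signaled) == 0 && was_signaled) {
            /* we consumed the signaled state; a reset never wakes waiters */
    }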
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 25 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -559,6 +559,28 @@ static int ntsync_event_set(struct ntsyn + return 0; + } + ++static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -584,6 +606,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_EVENT_SET: + return ntsync_event_set(obj, argp); ++ case NTSYNC_IOC_EVENT_RESET: ++ return ntsync_event_reset(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -52,5 +52,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) ++#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch b/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch new file mode 100644 index 0000000..6f14ed4 --- /dev/null +++ b/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch @@ -0,0 +1,60 @@ +From 99bca5d776a3011214041c42107a210fe315a35e Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:35 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_PULSE. + +This corresponds to the NT syscall NtPulseEvent(). + +This wakes up any waiters as if the event had been set, but does not set the +event, instead resetting it if it had been signalled. Thus, for a manual-reset +event, all waiters are woken, whereas for an auto-reset event, at most one +waiter is woken. 
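Sketch of a pulse (event_fd assumed as above; illustration only):

    __u32 prev = 0;

    /* wakes eligible waiters as if the event were set, then leaves it unsignaled */
    ioctl(event_fd, NTSYNC_IOC_EVENT_PULSE, &prev);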
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 8 ++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -534,7 +534,7 @@ static int ntsync_mutex_kill(struct ntsy + return ret; + } + +-static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) + { + struct ntsync_device *dev = event->dev; + __u32 prev_state; +@@ -550,6 +550,8 @@ static int ntsync_event_set(struct ntsyn + if (all) + try_wake_all_obj(dev, event); + try_wake_any_event(event); ++ if (pulse) ++ event->u.event.signaled = false; + + ntsync_unlock_obj(dev, event, all); + +@@ -605,9 +607,11 @@ static long ntsync_obj_ioctl(struct file + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_EVENT_SET: +- return ntsync_event_set(obj, argp); ++ return ntsync_event_set(obj, argp, false); + case NTSYNC_IOC_EVENT_RESET: + return ntsync_event_reset(obj, argp); ++ case NTSYNC_IOC_EVENT_PULSE: ++ return ntsync_event_set(obj, argp, true); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -53,5 +53,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) ++#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch b/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch new file mode 100644 index 0000000..19f8345 --- /dev/null +++ b/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch @@ -0,0 +1,66 @@ +From 1ef0ea672662bd19e7c6a4eac1067d11e50844b2 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:36 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_SEM_READ. + +This corresponds to the NT syscall NtQuerySemaphore(). + +This returns the current count and maximum count of the semaphore. 
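The read is non-destructive; a minimal sketch (sem_fd assumed, illustration only):

    struct ntsync_sem_args state;

    if (ioctl(sem_fd, NTSYNC_IOC_SEM_READ, &state) == 0)
            printf("count %u of max %u\n", state.count, state.max);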
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 27 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -583,6 +583,30 @@ static int ntsync_event_reset(struct nts + return 0; + } + ++static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) ++{ ++ struct ntsync_sem_args __user *user_args = argp; ++ struct ntsync_device *dev = sem->dev; ++ struct ntsync_sem_args args; ++ bool all; ++ ++ if (sem->type != NTSYNC_TYPE_SEM) ++ return -EINVAL; ++ ++ args.sem = 0; ++ ++ all = ntsync_lock_obj(dev, sem); ++ ++ args.count = sem->u.sem.count; ++ args.max = sem->u.sem.max; ++ ++ ntsync_unlock_obj(dev, sem, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -602,6 +626,8 @@ static long ntsync_obj_ioctl(struct file + switch (cmd) { + case NTSYNC_IOC_SEM_POST: + return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_READ: ++ return ntsync_sem_read(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -54,5 +54,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) ++#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch b/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch new file mode 100644 index 0000000..d75f8c3 --- /dev/null +++ b/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch @@ -0,0 +1,68 @@ +From 7891b7d15abd12975aebb955821fbc43353b45d6 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:37 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_READ. + +This corresponds to the NT syscall NtQueryMutant(). + +This returns the recursion count, owner, and abandoned state of the mutex. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 28 ++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 29 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -607,6 +607,32 @@ static int ntsync_sem_read(struct ntsync + return 0; + } + ++static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ bool all; ++ int ret; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ args.mutex = 0; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ args.count = mutex->u.mutex.count; ++ args.owner = mutex->u.mutex.owner; ++ ret = mutex->u.mutex.ownerdead ? 
-EOWNERDEAD : 0; ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -632,6 +658,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_MUTEX_READ: ++ return ntsync_mutex_read(obj, argp); + case NTSYNC_IOC_EVENT_SET: + return ntsync_event_set(obj, argp, false); + case NTSYNC_IOC_EVENT_RESET: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -55,5 +55,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) ++#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch b/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch new file mode 100644 index 0000000..a72cd1b --- /dev/null +++ b/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch @@ -0,0 +1,66 @@ +From 35ff252f99aa4002e0c2ecef37314a422969791b Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:38 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_READ. + +This corresponds to the NT syscall NtQueryEvent(). + +This returns the signaled state of the event and whether it is manual-reset. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 27 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -633,6 +633,30 @@ static int ntsync_mutex_read(struct ntsy + return ret; + } + ++static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_device *dev = event->dev; ++ struct ntsync_event_args args; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ args.event = 0; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ args.manual = event->u.event.manual; ++ args.signaled = event->u.event.signaled; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -666,6 +690,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_event_reset(obj, argp); + case NTSYNC_IOC_EVENT_PULSE: + return ntsync_event_set(obj, argp, true); ++ case NTSYNC_IOC_EVENT_READ: ++ return ntsync_event_read(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -56,5 +56,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) + #define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) ++#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch b/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch new file mode 100644 index 
0000000..88580a0 --- /dev/null +++ b/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch @@ -0,0 +1,187 @@ +From 2c391d57d1393cd46bf8bab08232ddc3dd32d5e5 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:39 -0500 +Subject: ntsync: Introduce alertable waits. + +NT waits can optionally be made "alertable". This is a special channel for +thread wakeup that is mildly similar to SIGIO. A thread has an internal single +bit of "alerted" state, and if a thread is alerted while an alertable wait, the +wait will return a special value, consume the "alerted" state, and will not +consume any of its objects. + +Alerts are implemented using events; the user-space NT emulator is expected to +create an internal ntsync event for each thread and pass that event to wait +functions. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 70 ++++++++++++++++++++++++++++++++----- + include/uapi/linux/ntsync.h | 3 +- + 2 files changed, 63 insertions(+), 10 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -885,22 +885,29 @@ static int setup_wait(struct ntsync_devi + const struct ntsync_wait_args *args, bool all, + struct ntsync_q **ret_q) + { ++ int fds[NTSYNC_MAX_WAIT_COUNT + 1]; + const __u32 count = args->count; +- int fds[NTSYNC_MAX_WAIT_COUNT]; + struct ntsync_q *q; ++ __u32 total_count; + __u32 i, j; + +- if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) + return -EINVAL; + + if (args->count > NTSYNC_MAX_WAIT_COUNT) + return -EINVAL; + ++ total_count = count; ++ if (args->alert) ++ total_count++; ++ + if (copy_from_user(fds, u64_to_user_ptr(args->objs), + array_size(count, sizeof(*fds)))) + return -EFAULT; ++ if (args->alert) ++ fds[count] = args->alert; + +- q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); ++ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); + if (!q) + return -ENOMEM; + q->task = current; +@@ -910,7 +917,7 @@ static int setup_wait(struct ntsync_devi + q->ownerdead = false; + q->count = count; + +- for (i = 0; i < count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = get_obj(dev, fds[i]); + +@@ -960,10 +967,10 @@ static void try_wake_any_obj(struct ntsy + static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) + { + struct ntsync_wait_args args; ++ __u32 i, total_count; + struct ntsync_q *q; + int signaled; + bool all; +- __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -973,9 +980,13 @@ static int ntsync_wait_any(struct ntsync + if (ret < 0) + return ret; + ++ total_count = args.count; ++ if (args.alert) ++ total_count++; ++ + /* queue ourselves */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +@@ -984,9 +995,15 @@ static int ntsync_wait_any(struct ntsync + ntsync_unlock_obj(dev, obj, all); + } + +- /* check if we are already signaled */ ++ /* ++ * Check if we are already signaled. ++ * ++ * Note that the API requires that normal objects are checked before ++ * the alert event. Hence we queue the alert event last, and check ++ * objects in order. 
++ */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_obj *obj = q->entries[i].obj; + + if (atomic_read(&q->signaled) != -1) +@@ -1003,7 +1020,7 @@ static int ntsync_wait_any(struct ntsync + + /* and finally, unqueue */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +@@ -1063,6 +1080,14 @@ static int ntsync_wait_all(struct ntsync + */ + list_add_tail(&entry->node, &obj->all_waiters); + } ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ dev_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ dev_unlock_obj(dev, obj); ++ } + + /* check if we are already signaled */ + +@@ -1070,6 +1095,21 @@ static int ntsync_wait_all(struct ntsync + + mutex_unlock(&dev->wait_all_lock); + ++ /* ++ * Check if the alert event is signaled, making sure to do so only ++ * after checking if the other objects are signaled. ++ */ ++ ++ if (args.alert) { ++ struct ntsync_obj *obj = q->entries[args.count].obj; ++ ++ if (atomic_read(&q->signaled) == -1) { ++ bool all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ } ++ + /* sleep */ + + ret = ntsync_schedule(q, &args); +@@ -1095,6 +1135,18 @@ static int ntsync_wait_all(struct ntsync + + mutex_unlock(&dev->wait_all_lock); + ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ bool all; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -37,7 +37,8 @@ struct ntsync_wait_args { + __u32 index; + __u32 flags; + __u32 owner; +- __u32 pad[2]; ++ __u32 alert; ++ __u32 pad; + }; + + #define NTSYNC_MAX_WAIT_COUNT 64 diff --git a/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch b/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch new file mode 100644 index 0000000..8baae5c --- /dev/null +++ b/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch @@ -0,0 +1,30 @@ +From 8d87043cd76368bb9996ba541d12e40cbb4201e5 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:52 -0500 +Subject: maintainers: Add an entry for ntsync. + +Add myself as maintainer, supported by CodeWeavers. 
+ +Signed-off-by: Elizabeth Figura +--- + MAINTAINERS | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Softwa + F: Documentation/filesystems/ntfs3.rst + F: fs/ntfs3/ + ++NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER ++M: Elizabeth Figura ++L: wine-devel@winehq.org ++S: Supported ++F: Documentation/userspace-api/ntsync.rst ++F: drivers/misc/ntsync.c ++F: include/uapi/linux/ntsync.h ++F: tools/testing/selftests/drivers/ntsync/ ++ + NUBUS SUBSYSTEM + M: Finn Thain + L: linux-m68k@lists.linux-m68k.org diff --git a/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch b/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch new file mode 100644 index 0000000..a5f7b7e --- /dev/null +++ b/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch @@ -0,0 +1,426 @@ +From 4cb25d42d38f1e0b144b084674591b70afa60bb0 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:53 -0500 +Subject: docs: ntsync: Add documentation for the ntsync uAPI. + +Add an overall explanation of the driver architecture, and complete and precise +specification for its intended behaviour. + +Signed-off-by: Elizabeth Figura +--- + Documentation/userspace-api/index.rst | 1 + + Documentation/userspace-api/ntsync.rst | 398 +++++++++++++++++++++++++ + 2 files changed, 399 insertions(+) + create mode 100644 Documentation/userspace-api/ntsync.rst + +--- a/Documentation/userspace-api/index.rst ++++ b/Documentation/userspace-api/index.rst +@@ -63,6 +63,7 @@ Everything else + vduse + futex2 + perf_ring_buffer ++ ntsync + + .. only:: subproject and html + +--- /dev/null ++++ b/Documentation/userspace-api/ntsync.rst +@@ -0,0 +1,398 @@ ++=================================== ++NT synchronization primitive driver ++=================================== ++ ++This page documents the user-space API for the ntsync driver. ++ ++ntsync is a support driver for emulation of NT synchronization ++primitives by user-space NT emulators. It exists because implementation ++in user-space, using existing tools, cannot match Windows performance ++while offering accurate semantics. It is implemented entirely in ++software, and does not drive any hardware device. ++ ++This interface is meant as a compatibility tool only, and should not ++be used for general synchronization. Instead use generic, versatile ++interfaces such as futex(2) and poll(2). ++ ++Synchronization primitives ++========================== ++ ++The ntsync driver exposes three types of synchronization primitives: ++semaphores, mutexes, and events. ++ ++A semaphore holds a single volatile 32-bit counter, and a static 32-bit ++integer denoting the maximum value. It is considered signaled (that is, ++can be acquired without contention, or will wake up a waiting thread) ++when the counter is nonzero. The counter is decremented by one when a ++wait is satisfied. Both the initial and maximum count are established ++when the semaphore is created. ++ ++A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit ++identifier denoting its owner. A mutex is considered signaled when its ++owner is zero (indicating that it is not owned). The recursion count is ++incremented when a wait is satisfied, and ownership is set to the given ++identifier. ++ ++A mutex also holds an internal flag denoting whether its previous owner ++has died; such a mutex is said to be abandoned. 
Owner death is not ++tracked automatically based on thread death, but rather must be ++communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is ++inherently considered unowned. ++ ++Except for the "unowned" semantics of zero, the actual value of the ++owner identifier is not interpreted by the ntsync driver at all. The ++intended use is to store a thread identifier; however, the ntsync ++driver does not actually validate that a calling thread provides ++consistent or unique identifiers. ++ ++An event is similar to a semaphore with a maximum count of one. It holds ++a volatile boolean state denoting whether it is signaled or not. There ++are two types of events, auto-reset and manual-reset. An auto-reset ++event is designaled when a wait is satisfied; a manual-reset event is ++not. The event type is specified when the event is created. ++ ++Unless specified otherwise, all operations on an object are atomic and ++totally ordered with respect to other operations on the same object. ++ ++Objects are represented by files. When all file descriptors to an ++object are closed, that object is deleted. ++ ++Char device ++=========== ++ ++The ntsync driver creates a single char device /dev/ntsync. Each file ++description opened on the device represents a unique instance intended ++to back an individual NT virtual machine. Objects created by one ntsync ++instance may only be used with other objects created by the same ++instance. ++ ++ioctl reference ++=============== ++ ++All operations on the device are done through ioctls. There are four ++structures used in ioctl calls:: ++ ++ struct ntsync_sem_args { ++ __u32 sem; ++ __u32 count; ++ __u32 max; ++ }; ++ ++ struct ntsync_mutex_args { ++ __u32 mutex; ++ __u32 owner; ++ __u32 count; ++ }; ++ ++ struct ntsync_event_args { ++ __u32 event; ++ __u32 signaled; ++ __u32 manual; ++ }; ++ ++ struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 owner; ++ __u32 index; ++ __u32 alert; ++ __u32 flags; ++ __u32 pad; ++ }; ++ ++Depending on the ioctl, members of the structure may be used as input, ++output, or not at all. All ioctls return 0 on success. ++ ++The ioctls on the device file are as follows: ++ ++.. c:macro:: NTSYNC_IOC_CREATE_SEM ++ ++ Create a semaphore object. Takes a pointer to struct ++ :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``sem`` ++ - On output, contains a file descriptor to the created semaphore. ++ * - ``count`` ++ - Initial count of the semaphore. ++ * - ``max`` ++ - Maximum count of the semaphore. ++ ++ Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_MUTEX ++ ++ Create a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - On output, contains a file descriptor to the created mutex. ++ * - ``count`` ++ - Initial recursion count of the mutex. ++ * - ``owner`` ++ - Initial owner of the mutex. ++ ++ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is ++ zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_EVENT ++ ++ Create an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``event`` ++ - On output, contains a file descriptor to the created event. ++ * - ``signaled`` ++ - If nonzero, the event is initially signaled, otherwise ++ nonsignaled. 
++ * - ``manual`` ++ - If nonzero, the event is a manual-reset event, otherwise ++ auto-reset. ++ ++The ioctls on the individual objects are as follows: ++ ++.. c:macro:: NTSYNC_IOC_SEM_POST ++ ++ Post to a semaphore object. Takes a pointer to a 32-bit integer, ++ which on input holds the count to be added to the semaphore, and on ++ output contains its previous count. ++ ++ If adding to the semaphore's current count would raise the latter ++ past the semaphore's maximum count, the ioctl fails with ++ ``EOVERFLOW`` and the semaphore is not affected. If raising the ++ semaphore's count causes it to become signaled, eligible threads ++ waiting on this semaphore will be woken and the semaphore's count ++ decremented appropriately. ++ ++.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK ++ ++ Release a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - Ignored. ++ * - ``owner`` ++ - Specifies the owner trying to release this mutex. ++ * - ``count`` ++ - On output, contains the previous recursion count. ++ ++ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` ++ is not the current owner of the mutex, the ioctl fails with ++ ``EPERM``. ++ ++ The mutex's count will be decremented by one. If decrementing the ++ mutex's count causes it to become zero, the mutex is marked as ++ unowned and signaled, and eligible threads waiting on it will be ++ woken as appropriate. ++ ++.. c:macro:: NTSYNC_IOC_SET_EVENT ++ ++ Signal an event object. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ Eligible threads will be woken, and auto-reset events will be ++ designaled appropriately. ++ ++.. c:macro:: NTSYNC_IOC_RESET_EVENT ++ ++ Designal an event object. Takes a pointer to a 32-bit integer, which ++ on output contains the previous state of the event. ++ ++.. c:macro:: NTSYNC_IOC_PULSE_EVENT ++ ++ Wake threads waiting on an event object while leaving it in an ++ unsignaled state. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ A pulse operation can be thought of as a set followed by a reset, ++ performed as a single atomic operation. If two threads are waiting on ++ an auto-reset event which is pulsed, only one will be woken. If two ++ threads are waiting a manual-reset event which is pulsed, both will ++ be woken. However, in both cases, the event will be unsignaled ++ afterwards, and a simultaneous read operation will always report the ++ event as unsignaled. ++ ++.. c:macro:: NTSYNC_IOC_READ_SEM ++ ++ Read the current state of a semaphore object. Takes a pointer to ++ struct :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``sem`` ++ - Ignored. ++ * - ``count`` ++ - On output, contains the current count of the semaphore. ++ * - ``max`` ++ - On output, contains the maximum count of the semaphore. ++ ++.. c:macro:: NTSYNC_IOC_READ_MUTEX ++ ++ Read the current state of a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - Ignored. ++ * - ``owner`` ++ - On output, contains the current owner of the mutex, or zero ++ if the mutex is not currently owned. ++ * - ``count`` ++ - On output, contains the current recursion count of the mutex. ++ ++ If the mutex is marked as abandoned, the function fails with ++ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to ++ zero. ++ ++.. 
c:macro:: NTSYNC_IOC_READ_EVENT ++ ++ Read the current state of an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``event`` ++ - Ignored. ++ * - ``signaled`` ++ - On output, contains the current state of the event. ++ * - ``manual`` ++ - On output, contains 1 if the event is a manual-reset event, ++ and 0 otherwise. ++ ++.. c:macro:: NTSYNC_IOC_KILL_OWNER ++ ++ Mark a mutex as unowned and abandoned if it is owned by the given ++ owner. Takes an input-only pointer to a 32-bit integer denoting the ++ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the ++ owner does not own the mutex, the function fails with ``EPERM``. ++ ++ Eligible threads waiting on the mutex will be woken as appropriate ++ (and such waits will fail with ``EOWNERDEAD``, as described below). ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ANY ++ ++ Poll on any of a list of objects, atomically acquiring at most one. ++ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is ++ used as follows: ++ ++ .. list-table:: ++ ++ * - ``timeout`` ++ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME`` ++ is set, the timeout is measured against the REALTIME clock; ++ otherwise it is measured against the MONOTONIC clock. If the ++ timeout is equal to or earlier than the current time, the ++ function returns immediately without sleeping. If ``timeout`` ++ is U64_MAX, the function will sleep until an object is ++ signaled, and will not fail with ``ETIMEDOUT``. ++ * - ``objs`` ++ - Pointer to an array of ``count`` file descriptors ++ (specified as an integer so that the structure has the same ++ size regardless of architecture). If any object is ++ invalid, the function fails with ``EINVAL``. ++ * - ``count`` ++ - Number of objects specified in the ``objs`` array. ++ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails ++ with ``EINVAL``. ++ * - ``owner`` ++ - Mutex owner identifier. If any object in ``objs`` is a mutex, ++ the ioctl will attempt to acquire that mutex on behalf of ++ ``owner``. If ``owner`` is zero, the ioctl fails with ++ ``EINVAL``. ++ * - ``index`` ++ - On success, contains the index (into ``objs``) of the object ++ which was signaled. If ``alert`` was signaled instead, ++ this contains ``count``. ++ * - ``alert`` ++ - Optional event object file descriptor. If nonzero, this ++ specifies an "alert" event object which, if signaled, will ++ terminate the wait. If nonzero, the identifier must point to a ++ valid event. ++ * - ``flags`` ++ - Zero or more flags. Currently the only flag is ++ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be ++ measured against the REALTIME clock instead of MONOTONIC. ++ * - ``pad`` ++ - Unused, must be set to zero. ++ ++ This function attempts to acquire one of the given objects. If unable ++ to do so, it sleeps until an object becomes signaled, subsequently ++ acquiring it, or the timeout expires. In the latter case the ioctl ++ fails with ``ETIMEDOUT``. The function only acquires one object, even ++ if multiple objects are signaled. ++ ++ A semaphore is considered to be signaled if its count is nonzero, and ++ is acquired by decrementing its count by one. A mutex is considered ++ to be signaled if it is unowned or if its owner matches the ``owner`` ++ argument, and is acquired by incrementing its recursion count by one ++ and setting its owner to the ``owner`` argument. 
An auto-reset event ++ is acquired by designaling it; a manual-reset event is not affected ++ by acquisition. ++ ++ Acquisition is atomic and totally ordered with respect to other ++ operations on the same object. If two wait operations (with different ++ ``owner`` identifiers) are queued on the same mutex, only one is ++ signaled. If two wait operations are queued on the same semaphore, ++ and a value of one is posted to it, only one is signaled. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Although this is a failure return, the function may ++ otherwise be considered successful. The mutex is marked as owned by ++ the given owner (with a recursion count of 1) and as no longer ++ abandoned, and ``index`` is still set to the index of the mutex. ++ ++ The ``alert`` argument is an "extra" event which can terminate the ++ wait, independently of all other objects. ++ ++ It is valid to pass the same object more than once, including by ++ passing the same event in the ``objs`` array and in ``alert``. If a ++ wakeup occurs due to that object being signaled, ``index`` is set to ++ the lowest index corresponding to that object. ++ ++ The function may fail with ``EINTR`` if a signal is received. ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ALL ++ ++ Poll on a list of objects, atomically acquiring all of them. Takes a ++ pointer to struct :c:type:`ntsync_wait_args`, which is used ++ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is ++ always filled with zero on success if not woken via alert. ++ ++ This function attempts to simultaneously acquire all of the given ++ objects. If unable to do so, it sleeps until all objects become ++ simultaneously signaled, subsequently acquiring them, or the timeout ++ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no ++ objects are modified. ++ ++ Objects may become signaled and subsequently designaled (through ++ acquisition by other threads) while this thread is sleeping. Only ++ once all objects are simultaneously signaled does the ioctl acquire ++ them and return. The entire acquisition is atomic and totally ordered ++ with respect to other operations on any of the given objects. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are ++ nevertheless marked as acquired. Note that if multiple mutex objects ++ are specified, there is no way to know which were marked as ++ abandoned. ++ ++ As with "any" waits, the ``alert`` argument is an "extra" event which ++ can terminate the wait. Critically, however, an "all" wait will ++ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is ++ signaled. In the latter case ``index`` will be set to ``count``. As ++ with "any" waits, if both conditions are filled, the former takes ++ priority, and objects in ``objs`` will be acquired. ++ ++ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same ++ object more than once, nor is it valid to pass the same object in ++ ``objs`` and in ``alert``. If this is attempted, the function fails ++ with ``EINVAL``. 
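
For reviewers of this uAPI patch, the following is a minimal user-space sketch of the documented flow (create a semaphore, post to it, wait on it). It is illustrative only and not part of the series; it assumes the macros and structures exported by include/uapi/linux/ntsync.h match the documentation above, and that the wait ioctls are issued on the /dev/ntsync file descriptor.

/* Minimal illustration of the ntsync uAPI described above; not part of the patch.
 * Assumes <linux/ntsync.h> provides the ioctls and structs exactly as documented,
 * and that NTSYNC_IOC_WAIT_ANY is issued on the device fd. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

int main(void)
{
	int dev = open("/dev/ntsync", O_RDWR);		/* one instance per emulated NT "VM" */
	if (dev < 0) { perror("open /dev/ntsync"); return 1; }

	struct ntsync_sem_args sem_args = { .count = 0, .max = 2 };
	if (ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem_args) < 0) { perror("create sem"); return 1; }
	int sem = sem_args.sem;				/* file descriptor of the new semaphore */

	__u32 post = 1;					/* count to add; returns previous count */
	if (ioctl(sem, NTSYNC_IOC_SEM_POST, &post) < 0) { perror("sem post"); return 1; }

	struct ntsync_wait_args wait_args = {
		.timeout = UINT64_MAX,			/* no timeout: sleep until signaled */
		.objs = (uintptr_t)&sem,		/* array of one object fd */
		.count = 1,
		.owner = 1,				/* arbitrary nonzero id; only consumed by mutexes */
	};
	if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait_args) < 0) { perror("wait any"); return 1; }

	printf("acquired object index %u\n", wait_args.index);
	close(sem);
	close(dev);
	return 0;
}

A real NT emulator would pass the calling thread's identifier as the owner and would typically wait on several objects at once; the error and ownership rules are exactly those spelled out in the documentation above.
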
diff --git a/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch b/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch new file mode 100644 index 0000000..b46522c --- /dev/null +++ b/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch @@ -0,0 +1,21 @@ +From 06bc88f16094c6f38e0890992af4a32415716c5d Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 18 Jul 2024 21:19:39 +0200 +Subject: Revert "misc: ntsync: mark driver as "broken" to prevent from + building" + +This reverts commit f5b335dc025cfee90957efa90dc72fada0d5abb4. +--- + drivers/misc/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/misc/Kconfig ++++ b/drivers/misc/Kconfig +@@ -507,7 +507,6 @@ config OPEN_DICE + + config NTSYNC + tristate "NT synchronization primitive emulation" +- depends on BROKEN + help + This module provides kernel support for emulation of Windows NT + synchronization primitives. It is not a hardware driver. diff --git a/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch new file mode 100644 index 0000000..9a93cc6 --- /dev/null +++ b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch @@ -0,0 +1,32 @@ +From 1fd57f0a5e998d99035ecb3b110cbdb588403c83 Mon Sep 17 00:00:00 2001 +From: Felix Fietkau +Date: Sat, 5 Dec 2015 15:07:03 +0100 +Subject: [PATCH] mac80211: ignore AP power level when tx power type is "fixed" + +In some cases a user might want to connect to a far away access point, +which announces a low tx power limit. Using the AP's power limit can +make the connection significantly more unstable or even impossible, and +mac80211 currently provides no way to disable this behavior. + +To fix this, use the currently unused distinction between limited and +fixed tx power to decide whether a remote AP's power limit should be +accepted. 
+ +Signed-off-by: Felix Fietkau +Signed-off-by: Alexandre Frade +--- + net/mac80211/iface.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/mac80211/iface.c ++++ b/net/mac80211/iface.c +@@ -62,7 +62,8 @@ bool __ieee80211_recalc_txpower(struct i + if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL) + power = min(power, sdata->deflink.user_power_level); + +- if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL) ++ if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL && ++ sdata->vif.bss_conf.txpower_type != NL80211_TX_POWER_FIXED) + power = min(power, sdata->deflink.ap_power_level); + + if (power != sdata->vif.bss_conf.txpower) { diff --git a/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch b/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch new file mode 100644 index 0000000..51f9900 --- /dev/null +++ b/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch @@ -0,0 +1,24 @@ +From: Felix Fietkau +Date: Tue, 23 Apr 2024 12:35:21 +0200 +Subject: [PATCH] net: enable fraglist GRO by default + +This can significantly improve performance for packet forwarding/bridging + +Signed-off-by: Felix Fietkau +--- + +--- a/include/linux/netdev_features.h ++++ b/include/linux/netdev_features.h +@@ -242,10 +242,10 @@ static inline int find_next_netdev_featu + #define NETIF_F_UPPER_DISABLES NETIF_F_LRO + + /* changeable features with no special hardware requirements */ +-#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) ++#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO | NETIF_F_GRO_FRAGLIST) + + /* Changeable features with no special hardware requirements that defaults to off. */ +-#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD) ++#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_UDP_FWD) + + #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ + NETIF_F_HW_VLAN_CTAG_RX | \ diff --git a/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch b/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch new file mode 100644 index 0000000..9240cc2 --- /dev/null +++ b/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch @@ -0,0 +1,129 @@ +From: Felix Fietkau +Date: Thu, 15 Aug 2024 21:15:13 +0200 +Subject: [PATCH] net: remove NETIF_F_GSO_FRAGLIST from NETIF_F_GSO_SOFTWARE + +Several drivers set NETIF_F_GSO_SOFTWARE, but mangle fraglist GRO packets +in a way that they can't be properly segmented anymore. +In order to properly deal with this, remove fraglist GSO from +NETIF_F_GSO_SOFTWARE and switch to NETIF_F_GSO_SOFTWARE_ALL (which includes +fraglist GSO) in places where it's safe to add. 
+ +Signed-off-by: Felix Fietkau +--- + +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -110,7 +110,7 @@ static void dummy_setup(struct net_devic + dev->flags &= ~IFF_MULTICAST; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST; +- dev->features |= NETIF_F_GSO_SOFTWARE; ++ dev->features |= NETIF_F_GSO_SOFTWARE_ALL; + dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX; + dev->features |= NETIF_F_GSO_ENCAP_ALL; + dev->hw_features |= dev->features; +--- a/drivers/net/loopback.c ++++ b/drivers/net/loopback.c +@@ -172,7 +172,7 @@ static void gen_lo_setup(struct net_devi + dev->flags = IFF_LOOPBACK; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + netif_keep_dst(dev); +- dev->hw_features = NETIF_F_GSO_SOFTWARE; ++ dev->hw_features = NETIF_F_GSO_SOFTWARE_ALL; + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_GSO_SOFTWARE + | NETIF_F_HW_CSUM +--- a/drivers/net/macvlan.c ++++ b/drivers/net/macvlan.c +@@ -897,7 +897,7 @@ static int macvlan_hwtstamp_set(struct n + static struct lock_class_key macvlan_netdev_addr_lock_key; + + #define ALWAYS_ON_OFFLOADS \ +- (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE_ALL | \ + NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) + + #define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX) +--- a/include/linux/netdev_features.h ++++ b/include/linux/netdev_features.h +@@ -219,13 +219,14 @@ static inline int find_next_netdev_featu + + /* List of features with software fallbacks. */ + #define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ +- NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) ++ NETIF_F_GSO_UDP_L4) ++#define NETIF_F_GSO_SOFTWARE_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_FRAGLIST) + + /* + * If one device supports one of these features, then enable them + * for all in netdev_increment_features. 
+ */ +-#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ ++#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ROBUST | \ + NETIF_F_SG | NETIF_F_HIGHDMA | \ + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + +--- a/net/8021q/vlan.h ++++ b/net/8021q/vlan.h +@@ -108,7 +108,7 @@ static inline netdev_features_t vlan_tnl + netdev_features_t ret; + + ret = real_dev->hw_enc_features & +- (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | ++ (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE_ALL | + NETIF_F_GSO_ENCAP_ALL); + + if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) +--- a/net/8021q/vlan_dev.c ++++ b/net/8021q/vlan_dev.c +@@ -561,7 +561,7 @@ static int vlan_dev_init(struct net_devi + dev->state |= (1 << __LINK_STATE_NOCARRIER); + + dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | +- NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | ++ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE_ALL | + NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | + NETIF_F_ALL_FCOE; +@@ -654,7 +654,7 @@ static netdev_features_t vlan_dev_fix_fe + if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + lower_features |= NETIF_F_HW_CSUM; + features = netdev_intersect_features(features, lower_features); +- features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); ++ features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE_ALL); + features |= NETIF_F_LLTX; + + return features; +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -2451,7 +2451,7 @@ void sk_setup_caps(struct sock *sk, stru + if (sk_is_tcp(sk)) + sk->sk_route_caps |= NETIF_F_GSO; + if (sk->sk_route_caps & NETIF_F_GSO) +- sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; ++ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE_ALL; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + if (sk_can_gso(sk)) { +--- a/net/mac80211/ieee80211_i.h ++++ b/net/mac80211/ieee80211_i.h +@@ -2009,7 +2009,7 @@ void ieee80211_color_collision_detection + /* interface handling */ + #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ + NETIF_F_HW_CSUM | NETIF_F_SG | \ +- NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \ ++ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE_ALL | \ + NETIF_F_HW_TC) + #define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM) + #define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \ +--- a/net/openvswitch/vport-internal_dev.c ++++ b/net/openvswitch/vport-internal_dev.c +@@ -109,7 +109,7 @@ static void do_setup(struct net_device * + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | +- NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; ++ NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ENCAP_ALL; + + netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; diff --git a/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch new file mode 100644 index 0000000..dda4678 --- /dev/null +++ b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch @@ -0,0 +1,762 @@ +From a657df31affbb91d8cb2718e70f42cf8ed6e9a7c Mon Sep 17 00:00:00 2001 +From: graysky +Date: Mon, 16 Sep 2024 05:55:58 -0400 +Subject: ZEN: Add graysky's more-ISA-levels-and-uarches +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From https://github.com/graysky2/kernel_compiler_patch + 
+more-ISA-levels-and-uarches-for-kernel-6.8-rc4+.patch + +FEATURES +This patch adds additional tunings via new x86-64 ISA levels and +more micro-architecture options to the Linux kernel in three classes. + +1. New generic x86-64 ISA levels + +These are selectable under: + Processor type and features ---> x86-64 compiler ISA level + +• x86-64 A value of (1) is the default +• x86-64-v2 A value of (2) brings support for vector + instructions up to Streaming SIMD Extensions 4.2 (SSE4.2) + and Supplemental Streaming SIMD Extensions 3 (SSSE3), the + POPCNT instruction, and CMPXCHG16B. +• x86-64-v3 A value of (3) adds vector instructions up to AVX2, MOVBE, + and additional bit-manipulation instructions. + +There is also x86-64-v4 but including this makes little sense as +the kernel does not use any of the AVX512 instructions anyway. + +Users of glibc 2.33 and above can see which level is supported by running: + /lib/ld-linux-x86-64.so.2 --help | grep supported +Or + /lib64/ld-linux-x86-64.so.2 --help | grep supported + +2. New micro-architectures + +These are selectable under: + Processor type and features ---> Processor family + +• AMD Improved K8-family +• AMD K10-family +• AMD Family 10h (Barcelona) +• AMD Family 14h (Bobcat) +• AMD Family 16h (Jaguar) +• AMD Family 15h (Bulldozer) +• AMD Family 15h (Piledriver) +• AMD Family 15h (Steamroller) +• AMD Family 15h (Excavator) +• AMD Family 17h (Zen) +• AMD Family 17h (Zen 2) +• AMD Family 19h (Zen 3)** +• AMD Family 19h (Zen 4)‡ +• AMD Family 1Ah (Zen 5)§ +• Intel Silvermont low-power processors +• Intel Goldmont low-power processors (Apollo Lake and Denverton) +• Intel Goldmont Plus low-power processors (Gemini Lake) +• Intel 1st Gen Core i3/i5/i7 (Nehalem) +• Intel 1.5 Gen Core i3/i5/i7 (Westmere) +• Intel 2nd Gen Core i3/i5/i7 (Sandybridge) +• Intel 3rd Gen Core i3/i5/i7 (Ivybridge) +• Intel 4th Gen Core i3/i5/i7 (Haswell) +• Intel 5th Gen Core i3/i5/i7 (Broadwell) +• Intel 6th Gen Core i3/i5/i7 (Skylake) +• Intel 6th Gen Core i7/i9 (Skylake X) +• Intel 8th Gen Core i3/i5/i7 (Cannon Lake) +• Intel 10th Gen Core i7/i9 (Ice Lake) +• Intel Xeon (Cascade Lake) +• Intel Xeon (Cooper Lake)* +• Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* +• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)† +• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)† +• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)† +• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)‡ +• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)‡ +• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)‡ + +Notes: If not otherwise noted, gcc >=9.1 is required for support. + *Requires gcc >=10.1 or clang >=10.0 + **Required gcc >=10.3 or clang >=12.0 + †Required gcc >=11.1 or clang >=12.0 + ‡Required gcc >=13.0 or clang >=15.0.5 + §Required gcc >14.0 or clang >=19.0? + +3. Auto-detected micro-architecture levels + +Compile by passing the '-march=native' option which, "selects the CPU +to generate code for at compilation time by determining the processor type of +the compiling machine. Using -march=native enables all instruction subsets +supported by the local machine and will produce code optimized for the local +machine under the constraints of the selected instruction set."[1] + +Users of Intel CPUs should select the 'Intel-Native' option and users of AMD +CPUs should select the 'AMD-Native' option. + +MINOR NOTES RELATING TO INTEL ATOM PROCESSORS +This patch also changes -march=atom to -march=bonnell in accordance with the +gcc v4.9 changes. 
Upstream is using the deprecated -match=atom flags when I +believe it should use the newer -march=bonnell flag for atom processors.[2] + +It is not recommended to compile on Atom-CPUs with the 'native' option.[3] The +recommendation is to use the 'atom' option instead. + +BENEFITS +Small but real speed increases are measurable using a make endpoint comparing +a generic kernel to one built with one of the respective microarchs. + +See the following experimental evidence supporting this statement: +https://github.com/graysky2/kernel_compiler_patch?tab=readme-ov-file#benchmarks + +REQUIREMENTS +linux version 6.8-rc3+ +gcc version >=9.0 or clang version >=9.0 + +ACKNOWLEDGMENTS +This patch builds on the seminal work by Jeroen.[4] + +REFERENCES +1. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options +2. https://bugzilla.kernel.org/show_bug.cgi?id=77461 +3. https://github.com/graysky2/kernel_gcc_patch/issues/15 +4. http://www.linuxforge.net/docs/linux/linux-gcc.php +--- + arch/x86/Kconfig.cpu | 363 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 89 +++++++- + arch/x86/include/asm/vermagic.h | 70 ++++++ + 3 files changed, 506 insertions(+), 16 deletions(-) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -155,9 +155,8 @@ config MPENTIUM4 + -Paxville + -Dempsey + +- + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + help + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +164,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + help + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +172,114 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ help ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ help ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ help ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ help ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ help ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ help ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ help ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ help ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ help ++ Select this for AMD Family 15h Excavator processors. 
++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ help ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ help ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ ++config MZEN3 ++ bool "AMD Zen 3" ++ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ Select this for AMD Family 19h Zen 3 processors. ++ ++ Enables -march=znver3 ++ ++config MZEN4 ++ bool "AMD Zen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000) ++ help ++ Select this for AMD Family 19h Zen 4 processors. ++ ++ Enables -march=znver4 ++ ++config MZEN5 ++ bool "AMD Zen 5" ++ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000) ++ help ++ Select this for AMD Family 19h Zen 5 processors. ++ ++ Enables -march=znver5 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -269,8 +370,17 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ help ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + help + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,14 +388,191 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" + help + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ help ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ help ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ help ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ help ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ help ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ help ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ help ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ help ++ ++ Select this for 5th Gen Core processors in the Broadwell family. 
++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ help ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ help ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ help ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake ++ ++config MCOOPERLAKE ++ bool "Intel Cooper Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ help ++ ++ Select this for Xeon processors in the Cooper Lake family. ++ ++ Enables -march=cooperlake ++ ++config MTIGERLAKE ++ bool "Intel Tiger Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ help ++ ++ Select this for third-generation 10 nm process processors in the Tiger Lake family. ++ ++ Enables -march=tigerlake ++ ++config MSAPPHIRERAPIDS ++ bool "Intel Sapphire Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family. ++ ++ Enables -march=sapphirerapids ++ ++config MROCKETLAKE ++ bool "Intel Rocket Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for eleventh-generation processors in the Rocket Lake family. ++ ++ Enables -march=rocketlake ++ ++config MALDERLAKE ++ bool "Intel Alder Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for twelfth-generation processors in the Alder Lake family. ++ ++ Enables -march=alderlake ++ ++config MRAPTORLAKE ++ bool "Intel Raptor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for thirteenth-generation processors in the Raptor Lake family. ++ ++ Enables -march=raptorlake ++ ++config MMETEORLAKE ++ bool "Intel Meteor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for fourteenth-generation processors in the Meteor Lake family. ++ ++ Enables -march=meteorlake ++ ++config MEMERALDRAPIDS ++ bool "Intel Emerald Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for fifth-generation 10 nm process processors in the Emerald Rapids family. ++ ++ Enables -march=emeraldrapids + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -294,8 +581,32 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE_INTEL ++ bool "Intel-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for AMD CPUs. Intel Only! 
++ ++ Enables -march=native ++ ++config MNATIVE_AMD ++ bool "AMD-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for Intel CPUs. AMD Only! ++ ++ Enables -march=native ++ + endchoice + ++config SUPPORT_MARCH_CODEVERS ++ bool ++ default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +@@ -308,6 +619,30 @@ config X86_GENERIC + This is really intended for distributors who need more + generic optimizations. + ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 3 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. ++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + # + # Define implied options from the CPU selection here + config X86_INTERNODE_CACHE_SHIFT +@@ -318,7 +653,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD + default "4" if MELAN || M486SX || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -336,11 +671,11 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL + + config X86_USE_PPRO_CHECKSUM + def_bool y 
+- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD + + # + # P6_NOPs are a relatively minor optimization that require a family >= +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -178,14 +178,99 @@ else + cflags-$(CONFIG_MPSC) += -march=nocona + cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MATOM) += -march=atom +- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ ifeq ($(CONFIG_X86_64_VERSION),1) ++ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ else ++ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION) ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++ endif ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MZEN5) += -march=znver5 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -mno-tbm ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake ++ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake ++ 
cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids + KBUILD_CFLAGS += $(cflags-y) + + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 + rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom +- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 ++ rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 ++ rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona ++ rustflags-$(CONFIG_MBOBCAT) += -Ctarget-cpu=btver1 ++ rustflags-$(CONFIG_MJAGUAR) += -Ctarget-cpu=btver2 ++ rustflags-$(CONFIG_MBULLDOZER) += -Ctarget-cpu=bdver1 ++ rustflags-$(CONFIG_MPILEDRIVER) += -Ctarget-cpu=bdver2 ++ rustflags-$(CONFIG_MSTEAMROLLER) += -Ctarget-cpu=bdver3 ++ rustflags-$(CONFIG_MEXCAVATOR) += -Ctarget-cpu=bdver4 ++ rustflags-$(CONFIG_MZEN) += -Ctarget-cpu=znver1 ++ rustflags-$(CONFIG_MZEN2) += -Ctarget-cpu=znver2 ++ rustflags-$(CONFIG_MZEN3) += -Ctarget-cpu=znver3 ++ rustflags-$(CONFIG_MZEN4) += -Ctarget-cpu=znver4 ++ rustflags-$(CONFIG_MZEN5) += -Ctarget-cpu=znver5 ++ rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MNATIVE_AMD) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=bonnell ++ rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 ++ rustflags-$(CONFIG_MNEHALEM) += -Ctarget-cpu=nehalem ++ rustflags-$(CONFIG_MWESTMERE) += -Ctarget-cpu=westmere ++ rustflags-$(CONFIG_MSILVERMONT) += -Ctarget-cpu=silvermont ++ rustflags-$(CONFIG_MGOLDMONT) += -Ctarget-cpu=goldmont ++ rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus ++ rustflags-$(CONFIG_MSANDYBRIDGE) += -Ctarget-cpu=sandybridge ++ rustflags-$(CONFIG_MIVYBRIDGE) += -Ctarget-cpu=ivybridge ++ rustflags-$(CONFIG_MHASWELL) += -Ctarget-cpu=haswell ++ rustflags-$(CONFIG_MBROADWELL) += -Ctarget-cpu=broadwell ++ rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake ++ rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512 ++ rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake ++ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake ++ rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake ++ rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake ++ rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids ++ rustflags-$(CONFIG_MROCKETLAKE) += -Ctarget-cpu=rocketlake ++ rustflags-$(CONFIG_MALDERLAKE) += -Ctarget-cpu=alderlake ++ rustflags-$(CONFIG_MRAPTORLAKE) += -Ctarget-cpu=raptorlake ++ rustflags-$(CONFIG_MMETEORLAKE) += -Ctarget-cpu=meteorlake ++ rustflags-$(CONFIG_MEMERALDRAPIDS) += -Ctarget-cpu=emeraldrapids + KBUILD_RUSTFLAGS += $(rustflags-y) + + KBUILD_CFLAGS += -mno-red-zone +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -17,6 +17,54 @@ + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE_INTEL ++#define MODULE_PROC_FAMILY "NATIVE_INTEL " ++#elif defined CONFIG_MNATIVE_AMD ++#define MODULE_PROC_FAMILY "NATIVE_AMD " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define 
MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY "CASCADELAKE " ++#elif defined CONFIG_MCOOPERLAKE ++#define MODULE_PROC_FAMILY "COOPERLAKE " ++#elif defined CONFIG_MTIGERLAKE ++#define MODULE_PROC_FAMILY "TIGERLAKE " ++#elif defined CONFIG_MSAPPHIRERAPIDS ++#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " ++#elif defined CONFIG_ROCKETLAKE ++#define MODULE_PROC_FAMILY "ROCKETLAKE " ++#elif defined CONFIG_MALDERLAKE ++#define MODULE_PROC_FAMILY "ALDERLAKE " ++#elif defined CONFIG_MRAPTORLAKE ++#define MODULE_PROC_FAMILY "RAPTORLAKE " ++#elif defined CONFIG_MMETEORLAKE ++#define MODULE_PROC_FAMILY "METEORLAKE " ++#elif defined CONFIG_MEMERALDRAPIDS ++#define MODULE_PROC_FAMILY "EMERALDRAPIDS " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -35,6 +83,28 @@ + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE diff --git a/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch b/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch new file mode 100644 index 0000000..35fee6e --- /dev/null +++ b/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch @@ -0,0 +1,60 @@ +From 44295ad130b8735cecb288dd7463a14892803d9b Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 1 Oct 2024 02:05:12 +0200 +Subject: ZEN: Fixup graysky's more-ISA-levels-and-uarches + +See: https://github.com/graysky2/kernel_compiler_patch/issues/105 +--- + arch/x86/Kconfig.cpu | 4 ---- + arch/x86/Makefile | 4 ---- + arch/x86/include/asm/vermagic.h | 6 ++++++ + 3 files changed, 6 insertions(+), 8 deletions(-) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -603,10 +603,6 @@ config MNATIVE_AMD + + endchoice + +-config SUPPORT_MARCH_CODEVERS +- bool +- default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) +- + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -176,8 +176,6 @@ 
else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona +- cflags-$(CONFIG_MCORE2) += -march=core2 +- cflags-$(CONFIG_MATOM) += -march=atom + ifeq ($(CONFIG_X86_64_VERSION),1) + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic +@@ -229,8 +227,6 @@ else + + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona +- rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 +- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom + rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 + rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 + rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -105,6 +105,12 @@ + #define MODULE_PROC_FAMILY "ZEN " + #elif defined CONFIG_MZEN2 + #define MODULE_PROC_FAMILY "ZEN2 " ++#elif defined CONFIG_MZEN3 ++#define MODULE_PROC_FAMILY "ZEN3 " ++#elif defined CONFIG_MZEN4 ++#define MODULE_PROC_FAMILY "ZEN4 " ++#elif defined CONFIG_MZEN5 ++#define MODULE_PROC_FAMILY "ZEN5 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE diff --git a/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch new file mode 100644 index 0000000..b3ff8a4 --- /dev/null +++ b/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -0,0 +1,40 @@ +From 8dc948926f5b68b16a6a47a8f6e0a2154ac8ef3e Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sun, 11 Dec 2022 23:51:16 +0100 +Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3 + +This reverts a6036a41bffba3d5007e377483b425d470ad8042 (kbuild: drop +support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3), removes the +dependency on CONFIG_ARC and adds RUSTFLAGS. +--- + Makefile | 3 +++ + init/Kconfig | 6 ++++++ + 2 files changed, 9 insertions(+) + +--- a/Makefile ++++ b/Makefile +@@ -814,6 +814,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointe + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1407,6 +1407,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. 
++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help diff --git a/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch new file mode 100644 index 0000000..2507221 --- /dev/null +++ b/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -0,0 +1,11 @@ +--- a/Makefile ++++ b/Makefile +@@ -815,7 +815,7 @@ ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 + else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 +-KBUILD_CFLAGS += -O3 ++KBUILD_CFLAGS += -O3 $(call cc-option,-fivopts) + KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os diff --git a/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch b/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch new file mode 100644 index 0000000..b66b0b9 --- /dev/null +++ b/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch @@ -0,0 +1,25 @@ +From 3ebc1fdf3e0ee9bff1efe20eb5791eba5c84a810 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 3 Aug 2023 13:53:49 +0000 +Subject: [PATCH 01/19] XANMOD: x86/build: Prevent generating avx2 and avx512 + floating-point code + +Signed-off-by: Alexandre Frade +--- + arch/x86/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -70,9 +70,9 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f + KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +-KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ++KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f + + # + # CFLAGS for compiling floating point code inside the kernel. 
diff --git a/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch b/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch new file mode 100644 index 0000000..58c9da1 --- /dev/null +++ b/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch @@ -0,0 +1,11 @@ +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -70,7 +70,7 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f -fno-tree-vectorize + KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json + KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f + diff --git a/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch new file mode 100644 index 0000000..01b3aed --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch @@ -0,0 +1,70 @@ +From 3427331872c37b2edb42406c65764e1565b0591b Mon Sep 17 00:00:00 2001 +From: Perry Yuan +Date: Fri, 9 Aug 2024 14:09:05 +0800 +Subject: cpufreq: amd-pstate: add quirk for Ryzen 3000 series processor + +The Ryzen 3000 series processors have been observed lacking the +nominal_freq and lowest_freq parameters in their ACPI tables. This +absence causes issues with loading the amd-pstate driver on these +systems. Introduces a fix to resolve the dependency issue +by adding a quirk specifically for the Ryzen 3000 series. + +Reported-by: David Wang <00107082@163.com> +Signed-off-by: Perry Yuan +--- + drivers/cpufreq/amd-pstate.c | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -142,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62 + .lowest_freq = 550, + }; + ++static struct quirk_entry quirk_amd_mts = { ++ .nominal_freq = 3600, ++ .lowest_freq = 550, ++}; ++ + static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) + { + /** +@@ -158,6 +163,21 @@ static int __init dmi_matched_7k62_bios_ + return 0; + } + ++static int __init dmi_matched_mts_bios_bug(const struct dmi_system_id *dmi) ++{ ++ /** ++ * match the broken bios for ryzen 3000 series processor support CPPC V2 ++ * broken BIOS lack of nominal_freq and lowest_freq capabilities ++ * definition in ACPI tables ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ZEN2)) { ++ quirks = dmi->driver_data; ++ pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); ++ return 1; ++ } ++ ++ return 0; ++} + static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { + { + .callback = dmi_matched_7k62_bios_bug, +@@ -168,6 +188,16 @@ static const struct dmi_system_id amd_ps + }, + .driver_data = &quirk_amd_7k62, + }, ++ { ++ .callback = dmi_matched_mts_bios_bug, ++ .ident = "AMD Ryzen 3000", ++ .matches = { ++ DMI_MATCH(DMI_PRODUCT_NAME, "B450M MORTAR MAX (MS-7B89)"), ++ DMI_MATCH(DMI_BIOS_RELEASE, "06/10/2020"), ++ DMI_MATCH(DMI_BIOS_VERSION, "5.14"), ++ }, ++ .driver_data = &quirk_amd_mts, ++ }, + {} + }; + MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); diff --git a/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch 
b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch new file mode 100644 index 0000000..a2f1bf5 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch @@ -0,0 +1,88 @@ +From 44f21855901b1fd618ac16b07dbd14e8fea4ee13 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 31 Aug 2024 21:49:11 -0500 +Subject: cpufreq/amd-pstate: Export symbols for changing modes + +In order to effectively test all mode switch combinations export +everything necessarily for amd-pstate-ut to trigger a mode switch. + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 23 ++++++++++------------- + drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++ + 2 files changed, 24 insertions(+), 13 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -60,18 +60,6 @@ + #define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF + #define AMD_CPPC_EPP_POWERSAVE 0xFF + +-/* +- * enum amd_pstate_mode - driver working mode of amd pstate +- */ +-enum amd_pstate_mode { +- AMD_PSTATE_UNDEFINED = 0, +- AMD_PSTATE_DISABLE, +- AMD_PSTATE_PASSIVE, +- AMD_PSTATE_ACTIVE, +- AMD_PSTATE_GUIDED, +- AMD_PSTATE_MAX, +-}; +- + static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_UNDEFINED] = "undefined", + [AMD_PSTATE_DISABLE] = "disable", +@@ -81,6 +69,14 @@ static const char * const amd_pstate_mod + NULL, + }; + ++const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) ++{ ++ if (mode < 0 || mode >= AMD_PSTATE_MAX) ++ return NULL; ++ return amd_pstate_mode_string[mode]; ++} ++EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); ++ + struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +@@ -1392,7 +1388,7 @@ static ssize_t amd_pstate_show_status(ch + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); + } + +-static int amd_pstate_update_status(const char *buf, size_t size) ++int amd_pstate_update_status(const char *buf, size_t size) + { + int mode_idx; + +@@ -1409,6 +1405,7 @@ static int amd_pstate_update_status(cons + + return 0; + } ++EXPORT_SYMBOL_GPL(amd_pstate_update_status); + + static ssize_t status_show(struct device *dev, + struct device_attribute *attr, char *buf) +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -103,4 +103,18 @@ struct amd_cpudata { + bool boost_state; + }; + ++/* ++ * enum amd_pstate_mode - driver working mode of amd pstate ++ */ ++enum amd_pstate_mode { ++ AMD_PSTATE_UNDEFINED = 0, ++ AMD_PSTATE_DISABLE, ++ AMD_PSTATE_PASSIVE, ++ AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, ++ AMD_PSTATE_MAX, ++}; ++const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); ++int amd_pstate_update_status(const char *buf, size_t size); ++ + #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch new file mode 100644 index 0000000..ccfc5ab --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch @@ -0,0 +1,77 @@ +From aabfc7370a7da9c52be97c79ba70a20201e6864a Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 31 Aug 2024 21:49:12 -0500 +Subject: cpufreq/amd-pstate-ut: Add test case for mode switches + +There is a state machine in the amd-pstate driver utilized for +switches for all modes. 
To make sure that cleanup and setup works +properly for each mode add a unit test case that tries all +combinations. + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++- + 1 file changed, 40 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid + static void amd_pstate_ut_check_enabled(u32 index); + static void amd_pstate_ut_check_perf(u32 index); + static void amd_pstate_ut_check_freq(u32 index); ++static void amd_pstate_ut_check_driver(u32 index); + + static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { + {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, + {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, + {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, +- {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq } ++ {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, ++ {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } + }; + + static bool get_shared_mem(void) +@@ -257,6 +259,43 @@ skip_test: + cpufreq_cpu_put(policy); + } + ++static int amd_pstate_set_mode(enum amd_pstate_mode mode) ++{ ++ const char *mode_str = amd_pstate_get_mode_string(mode); ++ ++ pr_debug("->setting mode to %s\n", mode_str); ++ ++ return amd_pstate_update_status(mode_str, strlen(mode_str)); ++} ++ ++static void amd_pstate_ut_check_driver(u32 index) ++{ ++ enum amd_pstate_mode mode1, mode2; ++ int ret; ++ ++ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { ++ ret = amd_pstate_set_mode(mode1); ++ if (ret) ++ goto out; ++ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) { ++ if (mode1 == mode2) ++ continue; ++ ret = amd_pstate_set_mode(mode2); ++ if (ret) ++ goto out; ++ } ++ } ++out: ++ if (ret) ++ pr_warn("%s: failed to update status for %s->%s: %d\n", __func__, ++ amd_pstate_get_mode_string(mode1), ++ amd_pstate_get_mode_string(mode2), ret); ++ ++ amd_pstate_ut_cases[index].result = ret ? ++ AMD_PSTATE_UT_RESULT_FAIL : ++ AMD_PSTATE_UT_RESULT_PASS; ++} ++ + static int __init amd_pstate_ut_init(void) + { + u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); diff --git a/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch new file mode 100644 index 0000000..0201b74 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch @@ -0,0 +1,60 @@ +From 24e62fbc101d079d398ac6fc76f458676d3d9491 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sun, 1 Sep 2024 00:00:35 -0500 +Subject: cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit() + +amd_pstate_set_epp() calls cppc_set_epp_perf() which can fail for +a variety of reasons but this is ignored. Change the return flow +to allow failures. 
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1595,7 +1595,7 @@ static void amd_pstate_epp_cpu_exit(stru + pr_debug("CPU %d exiting\n", policy->cpu); + } + +-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) ++static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; + u32 max_perf, min_perf, min_limit_perf, max_limit_perf; +@@ -1645,7 +1645,7 @@ static void amd_pstate_epp_update_limit( + * This return value can only be negative for shared_memory + * systems where EPP register read/write not supported. + */ +- return; ++ return epp; + } + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +@@ -1658,12 +1658,13 @@ static void amd_pstate_epp_update_limit( + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); +- amd_pstate_set_epp(cpudata, epp); ++ return amd_pstate_set_epp(cpudata, epp); + } + + static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; +@@ -1673,7 +1674,9 @@ static int amd_pstate_epp_set_policy(str + + cpudata->policy = policy->policy; + +- amd_pstate_epp_update_limit(policy); ++ ret = amd_pstate_epp_update_limit(policy); ++ if (ret) ++ return ret; + + /* + * policy->cur is never updated with the amd_pstate_epp driver, but it diff --git a/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch b/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch new file mode 100644 index 0000000..32b183b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch @@ -0,0 +1,67 @@ +From 29c0347dd542e091e2f7e5980dd885f918f5f676 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:57 -0500 +Subject: x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c + +To prepare to let amd_get_highest_perf() detect preferred cores +it will require CPPC functions. Move amd_get_highest_perf() to +cppc.c to prepare for 'preferred core detection' rework. + +No functional changes intended. + +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++ + arch/x86/kernel/cpu/amd.c | 16 ---------------- + 2 files changed, 16 insertions(+), 16 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void) + init_done = true; + mutex_unlock(&freq_invariance_lock); + } ++ ++u32 amd_get_highest_perf(void) ++{ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++ ++ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || ++ (c->x86_model >= 0x70 && c->x86_model < 0x80))) ++ return 166; ++ ++ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || ++ (c->x86_model >= 0x40 && c->x86_model < 0x70))) ++ return 166; ++ ++ return 255; ++} ++EXPORT_SYMBOL_GPL(amd_get_highest_perf); +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsig + } + EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); + +-u32 amd_get_highest_perf(void) +-{ +- struct cpuinfo_x86 *c = &boot_cpu_data; +- +- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) +- return 166; +- +- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) +- return 166; +- +- return 255; +-} +-EXPORT_SYMBOL_GPL(amd_get_highest_perf); +- + static void zenbleed_check_cpu(void *unused) + { + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch b/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch new file mode 100644 index 0000000..7548e16 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch @@ -0,0 +1,95 @@ +From 072efeb45349edd8ba9def11b6a450eaf56690a8 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:58 -0500 +Subject: ACPI: CPPC: Adjust return code for inline functions in + !CONFIG_ACPI_CPPC_LIB + +Checkpath emits the following warning: +``` +WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP +``` + +Adjust the code accordingly. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + include/acpi/cppc_acpi.h | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -164,31 +164,31 @@ extern int cppc_set_auto_sel(int cpu, bo + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_enable(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline bool cppc_perf_ctrs_in_pcc(void) + { +@@ -212,27 +212,27 @@ static inline bool cpc_ffh_supported(voi + } + static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_auto_sel(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + #endif /* !CONFIG_ACPI_CPPC_LIB */ + diff --git a/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch b/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch new file mode 100644 index 0000000..dea1ebe --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch @@ -0,0 +1,162 @@ +From 21492d91ffc7c3fdb6507f64a74abf8326c75141 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:59 -0500 +Subject: x86/amd: Rename amd_get_highest_perf() to + amd_get_boost_ratio_numerator() + +The function name is ambiguous because it returns an intermediate value +for calculating maximum frequency rather than the CPPC 'Highest Perf' +register. + +Rename the function to clarify its use and allow the function to return +errors. Adjust the consumer in acpi-cpufreq to catch errors. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/include/asm/processor.h | 3 --- + arch/x86/kernel/acpi/cppc.c | 44 +++++++++++++++++++++++--------- + drivers/cpufreq/acpi-cpufreq.c | 12 ++++++--- + include/acpi/cppc_acpi.h | 5 ++++ + 4 files changed, 46 insertions(+), 18 deletions(-) + +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigne + } + + #ifdef CONFIG_CPU_SUP_AMD +-extern u32 amd_get_highest_perf(void); +- + /* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. +@@ -705,7 +703,6 @@ static __always_inline void amd_clear_di + + extern void amd_check_microcode(void); + #else +-static inline u32 amd_get_highest_perf(void) { return 0; } + static inline void amd_clear_divider(void) { } + static inline void amd_check_microcode(void) { } + #endif +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc + static void amd_set_max_freq_ratio(void) + { + struct cppc_perf_caps perf_caps; +- u64 highest_perf, nominal_perf; ++ u64 numerator, nominal_perf; + u64 perf_ratio; + int rc; + +@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void) + return; + } + +- highest_perf = amd_get_highest_perf(); ++ rc = amd_get_boost_ratio_numerator(0, &numerator); ++ if (rc) { ++ pr_debug("Could not retrieve highest performance (%d)\n", rc); ++ return; ++ } + nominal_perf = perf_caps.nominal_perf; + +- if (!highest_perf || !nominal_perf) { +- pr_debug("Could not retrieve highest or nominal performance\n"); ++ if (!nominal_perf) { ++ pr_debug("Could not retrieve nominal performance\n"); + return; + } + +- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); ++ perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { +@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void) + mutex_unlock(&freq_invariance_lock); + } + +-u32 amd_get_highest_perf(void) ++/** ++ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation ++ * @cpu: CPU to get numerator for. ++ * @numerator: Output variable for numerator. ++ * ++ * Determine the numerator to use for calculating the boost ratio on ++ * a CPU. On systems that support preferred cores, this will be a hardcoded ++ * value. On other systems this will the highest performance register value. ++ * ++ * Return: 0 for success, negative error code otherwise. 
++ */ ++int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) +- return 166; ++ (c->x86_model >= 0x70 && c->x86_model < 0x80))) { ++ *numerator = 166; ++ return 0; ++ } + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) +- return 166; ++ (c->x86_model >= 0x40 && c->x86_model < 0x70))) { ++ *numerator = 166; ++ return 0; ++ } ++ *numerator = 255; + +- return 255; ++ return 0; + } +-EXPORT_SYMBOL_GPL(amd_get_highest_perf); ++EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); +--- a/drivers/cpufreq/acpi-cpufreq.c ++++ b/drivers/cpufreq/acpi-cpufreq.c +@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned + return 0; + } + +- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) +- highest_perf = amd_get_highest_perf(); +- else ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { ++ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf); ++ if (ret) { ++ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n", ++ cpu, ret); ++ return 0; ++ } ++ } else { + highest_perf = perf_caps.highest_perf; ++ } + + nominal_perf = perf_caps.nominal_perf; + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum, + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); + extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); + extern int cppc_set_auto_sel(int cpu, bool enable); ++extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -234,6 +235,10 @@ static inline int cppc_get_auto_sel_caps + { + return -EOPNOTSUPP; + } ++static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) ++{ ++ return -EOPNOTSUPP; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ diff --git a/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch b/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch new file mode 100644 index 0000000..dcfd0a2 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch @@ -0,0 +1,35 @@ +From 6f10d066dce0f1781b514a0352f0b427a32b1bb2 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:00 -0500 +Subject: ACPI: CPPC: Drop check for non zero perf ratio + +perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number. +Shifting by one will never have a zero value. + +Drop the check. + +Suggested-by: Gautham R. Shenoy +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void) + return; + } + +- perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ +- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; +- if (!perf_ratio) { +- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); +- return; +- } ++ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; + + freq_invariance_set_perf_ratio(perf_ratio, false); + } diff --git a/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch b/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch new file mode 100644 index 0000000..f7caec6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch @@ -0,0 +1,44 @@ +From 8c142a91a58f24119e99d4e66b11890f4a4ef984 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:01 -0500 +Subject: ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn + +If the boost ratio isn't calculated properly for the system for any +reason this can cause other problems that are non-obvious. + +Raise all messages to warn instead. + +Suggested-by: Perry Yuan +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void) + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { +- pr_debug("Could not retrieve perf counters (%d)\n", rc); ++ pr_warn("Could not retrieve perf counters (%d)\n", rc); + return; + } + + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { +- pr_debug("Could not retrieve highest performance (%d)\n", rc); ++ pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } + nominal_perf = perf_caps.nominal_perf; + + if (!nominal_perf) { +- pr_debug("Could not retrieve nominal performance\n"); ++ pr_warn("Could not retrieve nominal performance\n"); + return; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch b/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch new file mode 100644 index 0000000..8081de8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch @@ -0,0 +1,138 @@ +From 952e7bdc4cf67603f230f8eb91818ad4676e5a83 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:02 -0500 +Subject: x86/amd: Move amd_get_highest_perf() out of amd-pstate + +amd_pstate_get_highest_perf() is a helper used to get the highest perf +value on AMD systems. It's used in amd-pstate as part of preferred +core handling, but applicable for acpi-cpufreq as well. + +Move it out to cppc handling code as amd_get_highest_perf(). + +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 30 ++++++++++++++++++++++++++++++ + drivers/cpufreq/amd-pstate.c | 34 ++-------------------------------- + include/acpi/cppc_acpi.h | 5 +++++ + 3 files changed, 37 insertions(+), 32 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void) + mutex_unlock(&freq_invariance_lock); + } + ++/* ++ * Get the highest performance register value. ++ * @cpu: CPU from which to get highest performance. ++ * @highest_perf: Return address for highest performance value. ++ * ++ * Return: 0 for success, negative error code otherwise. ++ */ ++int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) ++{ ++ u64 val; ++ int ret; ++ ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { ++ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); ++ if (ret) ++ goto out; ++ ++ val = AMD_CPPC_HIGHEST_PERF(val); ++ } else { ++ ret = cppc_get_highest_perf(cpu, &val); ++ if (ret) ++ goto out; ++ } ++ ++ WRITE_ONCE(*highest_perf, (u32)val); ++out: ++ return ret; ++} ++EXPORT_SYMBOL_GPL(amd_get_highest_perf); ++ + /** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -837,36 +837,6 @@ static void amd_pstste_sched_prefcore_wo + } + static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); + +-/* +- * Get the highest performance register value. +- * @cpu: CPU from which to get highest performance. +- * @highest_perf: Return address. +- * +- * Return: 0 for success, -EIO otherwise. +- */ +-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) +-{ +- int ret; +- +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- u64 cap1; +- +- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); +- if (ret) +- return ret; +- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); +- } else { +- u64 cppc_highest_perf; +- +- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); +- if (ret) +- return ret; +- WRITE_ONCE(*highest_perf, cppc_highest_perf); +- } +- +- return (ret); +-} +- + #define CPPC_MAX_PERF U8_MAX + + static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) +@@ -874,7 +844,7 @@ static void amd_pstate_init_prefcore(str + int ret, prio; + u32 highest_perf; + +- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); ++ ret = amd_get_highest_perf(cpudata->cpu, &highest_perf); + if (ret) + return; + +@@ -918,7 +888,7 @@ static void amd_pstate_update_limits(uns + if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) + goto free_cpufreq_put; + +- ret = amd_pstate_get_highest_perf(cpu, &cur_high); ++ ret = amd_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum, + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); + extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); + extern int cppc_set_auto_sel(int cpu, bool enable); ++extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); + extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) +@@ -235,6 +236,10 @@ static inline int cppc_get_auto_sel_caps + { + return -EOPNOTSUPP; + } ++static inline int 
amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) ++{ ++ return -ENODEV; ++} + static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { + return -EOPNOTSUPP; diff --git a/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch b/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch new file mode 100644 index 0000000..60ddc56 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch @@ -0,0 +1,251 @@ +From 3ab7da5bbf2087982dbfe2b0f2937d0dddc3afb1 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:03 -0500 +Subject: x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator() + +AMD systems that support preferred cores will use "166" as their +numerator for max frequency calculations instead of "255". + +Add a function for detecting preferred cores by looking at the +highest perf value on all cores. + +If preferred cores are enabled return 166 and if disabled the +value in the highest perf register. As the function will be called +multiple times, cache the values for the boost numerator and if +preferred cores will be enabled in global variables. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +--- + arch/x86/kernel/acpi/cppc.c | 93 ++++++++++++++++++++++++++++++++---- + drivers/cpufreq/amd-pstate.c | 34 +++++-------- + include/acpi/cppc_acpi.h | 5 ++ + 3 files changed, 101 insertions(+), 31 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -9,6 +9,16 @@ + #include + #include + ++#define CPPC_HIGHEST_PERF_PREFCORE 166 ++ ++enum amd_pref_core { ++ AMD_PREF_CORE_UNKNOWN = 0, ++ AMD_PREF_CORE_SUPPORTED, ++ AMD_PREF_CORE_UNSUPPORTED, ++}; ++static enum amd_pref_core amd_pref_core_detected; ++static u64 boost_numerator; ++ + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + + bool cpc_supported_by_cpu(void) +@@ -147,6 +157,66 @@ out: + EXPORT_SYMBOL_GPL(amd_get_highest_perf); + + /** ++ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores ++ * @detected: Output variable for the result of the detection. ++ * ++ * Determine whether CPUs in the system support preferred cores. On systems ++ * that support preferred cores, different highest perf values will be found ++ * on different cores. On other systems, the highest perf value will be the ++ * same on all cores. ++ * ++ * The result of the detection will be stored in the 'detected' parameter. ++ * ++ * Return: 0 for success, negative error code otherwise ++ */ ++int amd_detect_prefcore(bool *detected) ++{ ++ int cpu, count = 0; ++ u64 highest_perf[2] = {0}; ++ ++ if (WARN_ON(!detected)) ++ return -EINVAL; ++ ++ switch (amd_pref_core_detected) { ++ case AMD_PREF_CORE_SUPPORTED: ++ *detected = true; ++ return 0; ++ case AMD_PREF_CORE_UNSUPPORTED: ++ *detected = false; ++ return 0; ++ default: ++ break; ++ } ++ ++ for_each_present_cpu(cpu) { ++ u32 tmp; ++ int ret; ++ ++ ret = amd_get_highest_perf(cpu, &tmp); ++ if (ret) ++ return ret; ++ ++ if (!count || (count == 1 && tmp != highest_perf[0])) ++ highest_perf[count++] = tmp; ++ ++ if (count == 2) ++ break; ++ } ++ ++ *detected = (count == 2); ++ boost_numerator = highest_perf[0]; ++ ++ amd_pref_core_detected = *detected ? 
AMD_PREF_CORE_SUPPORTED : ++ AMD_PREF_CORE_UNSUPPORTED; ++ ++ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", ++ *detected ? "" : "un", highest_perf[0]); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(amd_detect_prefcore); ++ ++/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. +@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf); + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * ++ * If booting the system with amd-pstate enabled but preferred cores disabled then ++ * the correct boost numerator will be returned to match hardware capabilities ++ * even if the preferred cores scheduling hints are not enabled. ++ * + * Return: 0 for success, negative error code otherwise. + */ + int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { +- struct cpuinfo_x86 *c = &boot_cpu_data; +- +- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) { +- *numerator = 166; +- return 0; +- } ++ bool prefcore; ++ int ret; + +- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) { +- *numerator = 166; ++ ret = amd_detect_prefcore(&prefcore); ++ if (ret) ++ return ret; ++ ++ /* without preferred cores, return the highest perf register value */ ++ if (!prefcore) { ++ *numerator = boost_numerator; + return 0; + } +- *numerator = 255; ++ *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; + } +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -841,32 +841,18 @@ static DECLARE_WORK(sched_prefcore_work, + + static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) + { +- int ret, prio; +- u32 highest_perf; +- +- ret = amd_get_highest_perf(cpudata->cpu, &highest_perf); +- if (ret) ++ /* user disabled or not detected */ ++ if (!amd_pstate_prefcore) + return; + + cpudata->hw_prefcore = true; +- /* check if CPPC preferred core feature is enabled*/ +- if (highest_perf < CPPC_MAX_PERF) +- prio = (int)highest_perf; +- else { +- pr_debug("AMD CPPC preferred core is unsupported!\n"); +- cpudata->hw_prefcore = false; +- return; +- } +- +- if (!amd_pstate_prefcore) +- return; + + /* + * The priorities can be set regardless of whether or not + * sched_set_itmt_support(true) has been called and it is valid to + * update them at any time after it has been called. 
+ */ +- sched_set_itmt_core_prio(prio, cpudata->cpu); ++ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu); + + schedule_work(&sched_prefcore_work); + } +@@ -1037,12 +1023,12 @@ static int amd_pstate_cpu_init(struct cp + + cpudata->cpu = policy->cpu; + +- amd_pstate_init_prefcore(cpudata); +- + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + ++ amd_pstate_init_prefcore(cpudata); ++ + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; +@@ -1493,12 +1479,12 @@ static int amd_pstate_epp_cpu_init(struc + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + +- amd_pstate_init_prefcore(cpudata); +- + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + ++ amd_pstate_init_prefcore(cpudata); ++ + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; +@@ -1960,6 +1946,12 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_update_perf, cppc_update_perf); + } + ++ if (amd_pstate_prefcore) { ++ ret = amd_detect_prefcore(&amd_pstate_prefcore); ++ if (ret) ++ return ret; ++ } ++ + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -163,6 +163,7 @@ extern int cppc_get_auto_sel_caps(int cp + extern int cppc_set_auto_sel(int cpu, bool enable); + extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); + extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); ++extern int amd_detect_prefcore(bool *detected); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -244,6 +245,10 @@ static inline int amd_get_boost_ratio_nu + { + return -EOPNOTSUPP; + } ++static inline int amd_detect_prefcore(bool *detected) ++{ ++ return -ENODEV; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ diff --git a/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch new file mode 100644 index 0000000..e65aaff --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch @@ -0,0 +1,169 @@ +From 68d89574b86625f4bd7a784fe9bcc221dc290e4f Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:04 -0500 +Subject: cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into + amd_get_boost_ratio_numerator() + +The special case in amd_pstate_highest_perf_set() is the value used +for calculating the boost numerator. Merge this into +amd_get_boost_ratio_numerator() and then use that to calculate boost +ratio. + +This allows dropping more special casing of the highest perf value. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 3 +- + arch/x86/kernel/acpi/cppc.c | 16 +++++++ + drivers/cpufreq/amd-pstate.c | 52 ++++----------------- + 3 files changed, 28 insertions(+), 43 deletions(-) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Perfo + In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` + table, so we need to expose it to sysfs. 
If boost is not active, but + still supported, this maximum frequency will be larger than the one in +-``cpuinfo``. ++``cpuinfo``. On systems that support preferred core, the driver will have ++different values for some cores than others. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -9,6 +9,7 @@ + #include + #include + ++#define CPPC_HIGHEST_PERF_PERFORMANCE 196 + #define CPPC_HIGHEST_PERF_PREFCORE 166 + + enum amd_pref_core { +@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsign + *numerator = boost_numerator; + return 0; + } ++ ++ /* ++ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, ++ * the highest performance level is set to 196. ++ * https://bugzilla.kernel.org/show_bug.cgi?id=218759 ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { ++ switch (boot_cpu_data.x86_model) { ++ case 0x70 ... 0x7f: ++ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; ++ return 0; ++ default: ++ break; ++ } ++ } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -52,8 +52,6 @@ + #define AMD_PSTATE_TRANSITION_LATENCY 20000 + #define AMD_PSTATE_TRANSITION_DELAY 1000 + #define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600 +-#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +-#define CPPC_HIGHEST_PERF_DEFAULT 166 + + #define AMD_CPPC_EPP_PERFORMANCE 0x00 + #define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +@@ -398,43 +396,17 @@ static inline int amd_pstate_enable(bool + return static_call(amd_pstate_enable)(enable); + } + +-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata) +-{ +- struct cpuinfo_x86 *c = &cpu_data(0); +- +- /* +- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, +- * the highest performance level is set to 196. +- * https://bugzilla.kernel.org/show_bug.cgi?id=218759 +- */ +- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f)) +- return CPPC_HIGHEST_PERF_PERFORMANCE; +- +- return CPPC_HIGHEST_PERF_DEFAULT; +-} +- + static int pstate_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; +- u32 highest_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- /* For platforms that do not support the preferred core feature, the +- * highest_pef may be configured with 166 or 255, to avoid max frequency +- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as +- * the default max perf. 
+- */ +- if (cpudata->hw_prefcore) +- highest_perf = amd_pstate_highest_perf_set(cpudata); +- else +- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); ++ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); +@@ -446,19 +418,13 @@ static int pstate_init_perf(struct amd_c + static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; +- u32 highest_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- if (cpudata->hw_prefcore) +- highest_perf = amd_pstate_highest_perf_set(cpudata); +- else +- highest_perf = cppc_perf.highest_perf; +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); ++ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); ++ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); +@@ -944,8 +910,8 @@ static u32 amd_pstate_get_transition_lat + static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + { + int ret; +- u32 min_freq; +- u32 highest_perf, max_freq; ++ u32 min_freq, max_freq; ++ u64 numerator; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; +@@ -967,8 +933,10 @@ static int amd_pstate_init_freq(struct a + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + +- highest_perf = READ_ONCE(cpudata->highest_perf); +- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); diff --git a/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch new file mode 100644 index 0000000..3ab7c18 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch @@ -0,0 +1,42 @@ +From deed718125e73b6bf280dcebb80c39108226388c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:05 -0500 +Subject: cpufreq: amd-pstate: Optimize amd_pstate_update_limits() + +Don't take and release the mutex when prefcore isn't present and +avoid initialization of variables that will be initially set +in the function. + +Reviewed-by: Gautham R. Shenoy +Reviewed-by: Perry Yuan +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + drivers/cpufreq/amd-pstate.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -836,17 +836,17 @@ static void amd_pstate_update_limits(uns + + cpudata = policy->driver_data; + +- mutex_lock(&amd_pstate_driver_lock); +- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) +- goto free_cpufreq_put; ++ if (!amd_pstate_prefcore) ++ return; + ++ mutex_lock(&amd_pstate_driver_lock); + ret = amd_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; + + prev_high = READ_ONCE(cpudata->prefcore_ranking); +- if (prev_high != cur_high) { +- highest_perf_changed = true; ++ highest_perf_changed = (prev_high != cur_high); ++ if (highest_perf_changed) { + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + + if (cur_high < CPPC_MAX_PERF) diff --git a/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch new file mode 100644 index 0000000..34b04aa --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch @@ -0,0 +1,29 @@ +From 391075a34e392c7cacd338a6b034a21a10679855 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:06 -0500 +Subject: cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore` + +Explain that the sysfs file represents both preferred core being +enabled by the user and supported by the hardware. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CP + `_.) + This attribute is read-only. + ++``amd_pstate_hw_prefcore`` ++ ++Whether the platform supports the preferred core feature and it has been ++enabled. This attribute is read-only. ++ + ``energy_performance_available_preferences`` + + A list of all the supported EPP preferences that could be used for diff --git a/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch b/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch new file mode 100644 index 0000000..09b194b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch @@ -0,0 +1,42 @@ +From 2ed9874f6dcafcc2bee7a922af9e1d1c62dbeb18 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:07 -0500 +Subject: amd-pstate: Add missing documentation for + `amd_pstate_prefcore_ranking` + +`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU +core based on platform conditions. Explicitly include it in the +documentation. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performa + table, so we need to expose it to sysfs. 
If boost is not active, but + still supported, this maximum frequency will be larger than the one in + ``cpuinfo``. On systems that support preferred core, the driver will have +-different values for some cores than others. ++different values for some cores than others and this will reflect the values ++advertised by the platform at bootup. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +@@ -268,6 +269,12 @@ This attribute is read-only. + Whether the platform supports the preferred core feature and it has been + enabled. This attribute is read-only. + ++``amd_pstate_prefcore_ranking`` ++ ++The performance ranking of the core. This number doesn't have any unit, but ++larger numbers are preferred at the time of reading. This can change at ++runtime based on platform conditions. This attribute is read-only. ++ + ``energy_performance_available_preferences`` + + A list of all the supported EPP preferences that could be used for diff --git a/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch new file mode 100644 index 0000000..de4a858 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch @@ -0,0 +1,24 @@ +From 2e2ba39aec71fb51e897c3275b255ef806800cf0 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:23:51 -0500 +Subject: cpufreq/amd-pstate: Fix non kerneldoc comment + +The comment for amd_cppc_supported() isn't meant to be kernel doc. + +Fixes: cb817ec6673b7 ("cpufreq: amd-pstate: show CPPC debug message if CPPC is not supported") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1786,7 +1786,7 @@ static int __init amd_pstate_set_driver( + return -EINVAL; + } + +-/** ++/* + * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. + */ diff --git a/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch new file mode 100644 index 0000000..e770805 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch @@ -0,0 +1,24 @@ +From 185e64a7e1a749593f3d6dadc666da9dda82d48c Mon Sep 17 00:00:00 2001 +From: Qianqiang Liu +Date: Wed, 11 Sep 2024 07:39:24 +0800 +Subject: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue + +Using uninitialized value "mode2" when calling "amd_pstate_get_mode_string". +Set "mode2" to "AMD_PSTATE_DISABLE" by default. 
+ +Signed-off-by: Qianqiang Liu +--- + drivers/cpufreq/amd-pstate-ut.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -270,7 +270,7 @@ static int amd_pstate_set_mode(enum amd_ + + static void amd_pstate_ut_check_driver(u32 index) + { +- enum amd_pstate_mode mode1, mode2; ++ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE; + int ret; + + for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { diff --git a/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch new file mode 100644 index 0000000..efe032b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch @@ -0,0 +1,108 @@ +From d74ce254cc470da670d6b90c69bab553cdbde62b Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Tue, 17 Sep 2024 09:14:35 +0000 +Subject: cpufreq/amd-pstate: Rename MSR and shared memory specific functions + +Existing function names "cppc_*" and "pstate_*" for shared memory and +MSR based systems are not intuitive enough, replace them with "shmem_*" and +"msr_*" respectively. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -263,7 +263,7 @@ static int amd_pstate_get_energy_pref_in + return index; + } + +-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) + { + if (fast_switch) +@@ -273,7 +273,7 @@ static void pstate_update_perf(struct am + READ_ONCE(cpudata->cppc_req_cached)); + } + +-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); ++DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + + static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +@@ -336,7 +336,7 @@ static int amd_pstate_set_energy_pref_in + return ret; + } + +-static inline int pstate_enable(bool enable) ++static inline int msr_enable(bool enable) + { + int ret, cpu; + unsigned long logical_proc_id_mask = 0; +@@ -362,7 +362,7 @@ static inline int pstate_enable(bool ena + return 0; + } + +-static int cppc_enable(bool enable) ++static int shmem_enable(bool enable) + { + int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; +@@ -389,14 +389,14 @@ static int cppc_enable(bool enable) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); ++DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable); + + static inline int amd_pstate_enable(bool enable) + { + return static_call(amd_pstate_enable)(enable); + } + +-static int pstate_init_perf(struct amd_cpudata *cpudata) ++static int msr_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; + +@@ -415,7 +415,7 @@ static int pstate_init_perf(struct amd_c + return 0; + } + +-static int cppc_init_perf(struct amd_cpudata *cpudata) ++static int shmem_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; + +@@ -450,14 +450,14 @@ static int cppc_init_perf(struct amd_cpu + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); ++DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf); + + static inline int amd_pstate_init_perf(struct 
amd_cpudata *cpudata) + { + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static void cppc_update_perf(struct amd_cpudata *cpudata, ++static void shmem_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) + { +@@ -1909,9 +1909,9 @@ static int __init amd_pstate_init(void) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); +- static_call_update(amd_pstate_enable, cppc_enable); +- static_call_update(amd_pstate_init_perf, cppc_init_perf); +- static_call_update(amd_pstate_update_perf, cppc_update_perf); ++ static_call_update(amd_pstate_enable, shmem_enable); ++ static_call_update(amd_pstate_init_perf, shmem_init_perf); ++ static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + + if (amd_pstate_prefcore) { diff --git a/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch new file mode 100644 index 0000000..bd41ffe --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch @@ -0,0 +1,115 @@ +From 787175146e26a199c06be4e6bf8cf8da0f757271 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:52 +0000 +Subject: cpufreq: Add a callback to update the min_freq_req from drivers + +Currently, there is no proper way to update the initial lower frequency +limit from cpufreq drivers. Only way is to add a new min_freq qos +request from the driver side, but it leads to the issue explained below. + +The QoS infrastructure collates the constraints from multiple +subsystems and saves them in a plist. The "current value" is defined to +be the highest value in the plist for min_freq constraint. + +The cpufreq core adds a qos_request for min_freq to be 0 and the amd-pstate +driver today adds qos request for min_freq to be lowest_freq, where +lowest_freq corresponds to CPPC.lowest_perf. + +Eg: Suppose WLOG considering amd-pstate driver, lowest_freq is 400000 KHz, +lowest_non_linear_freq is 1200000 KHz. + +At this point of time, the min_freq QoS plist looks like: + +head--> 400000 KHz (registered by amd-pstate) --> 0 KHz (registered by +cpufreq core) + +When a user updates /sys/devices/system/cpu/cpuX/cpufreq/scaling_min_freq, +it only results in updating the cpufreq-core's node in the plist, where +say 0 becomes the newly echoed value. + +Now, if the user echoes a value 1000000 KHz, to scaling_min_freq, then the +new list would be + +head--> 1000000 KHz (registered by cpufreq core) --> 400000 KHz (registered +by amd-pstate) + +and the new "current value" of the min_freq QoS constraint will be 1000000 +KHz, this is the scenario where it works as expected. + +Suppose we change the amd-pstate driver code's min_freq qos constraint +to lowest_non_linear_freq instead of lowest_freq, then the user will +never be able to request a value below that, due to the following: + +At boot time, the min_freq QoS plist would be + +head--> 1200000 KHz (registered by amd-pstate) --> 0 KHz (registered by +cpufreq core) + +When the user echoes a value of 1000000 KHz, to +/sys/devices/..../scaling_min_freq, then the new list would be + +head--> 1200000 KHz (registered by amd-pstate) --> 1000000 KHz (registered +by cpufreq core) + +with the new "current value" of the min_freq QoS remaining 1200000 KHz. 
+Since the current value has not changed, there won't be any notifications +sent to the subsystems which have added their QoS constraints. In +particular, the amd-pstate driver will not get the notification, and thus, +the user's request to lower the scaling_min_freq will be ineffective. + +Hence, it is advisable to have a single source of truth for the min and +max freq QoS constraints between the cpufreq and the cpufreq drivers. + +So add a new callback get_init_min_freq() add in struct cpufreq_driver, +which allows amd-pstate (or any other cpufreq driver) to override the +default min_freq value being set in the policy->min_freq_req. Now +scaling_min_freq can be modified by the user to any value (lower or +higher than the init value) later on if desired. + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/cpufreq.c | 6 +++++- + include/linux/cpufreq.h | 6 ++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -1380,6 +1380,7 @@ static int cpufreq_online(unsigned int c + bool new_policy; + unsigned long flags; + unsigned int j; ++ u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE; + int ret; + + pr_debug("%s: bringing CPU%u online\n", __func__, cpu); +@@ -1464,9 +1465,12 @@ static int cpufreq_online(unsigned int c + goto out_destroy_policy; + } + ++ if (cpufreq_driver->get_init_min_freq) ++ init_min_freq = cpufreq_driver->get_init_min_freq(policy); ++ + ret = freq_qos_add_request(&policy->constraints, + policy->min_freq_req, FREQ_QOS_MIN, +- FREQ_QOS_MIN_DEFAULT_VALUE); ++ init_min_freq); + if (ret < 0) { + /* + * So we don't call freq_qos_remove_request() for an +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -414,6 +414,12 @@ struct cpufreq_driver { + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); ++ ++ /* ++ * Set by drivers that want to initialize the policy->min_freq_req with ++ * a value different from the default value (0) in cpufreq core. ++ */ ++ int (*get_init_min_freq)(struct cpufreq_policy *policy); + }; + + /* flags */ diff --git a/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch new file mode 100644 index 0000000..fa421e8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch @@ -0,0 +1,79 @@ +From f5b234be445a45b0bcacc37e0aad7a6bc7900eac Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:54 +0000 +Subject: cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq + +According to the AMD architectural programmer's manual volume 2 [1], in +section "17.6.4.1 CPPC_CAPABILITY_1" lowest_nonlinear_perf is described +as "Reports the most energy efficient performance level (in terms of +performance per watt). Above this threshold, lower performance levels +generally result in increased energy efficiency. Reducing performance +below this threshold does not result in total energy savings for a given +computation, although it reduces instantaneous power consumption". So +lowest_nonlinear_perf is the most power efficient performance level, and +going below that would lead to a worse performance/watt. 
+ +Also, setting the minimum frequency to lowest_nonlinear_freq (instead of +lowest_freq) allows the CPU to idle at a higher frequency which leads +to more time being spent in a deeper idle state (as trivial idle tasks +are completed sooner). This has shown a power benefit in some systems, +in other systems, power consumption has increased but so has the +throughput/watt. + +Use the get_init_min_freq() callback to set the initial lower limit for +amd-pstate driver to lowest_nonlinear_freq instead of lowest_freq. + +Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf [1] + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1025,13 +1025,6 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], +- FREQ_QOS_MIN, policy->cpuinfo.min_freq); +- if (ret < 0) { +- dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); +- goto free_cpudata1; +- } +- + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { +@@ -1736,6 +1729,13 @@ static int amd_pstate_epp_resume(struct + return 0; + } + ++static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ return READ_ONCE(cpudata->lowest_nonlinear_freq); ++} ++ + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -1749,6 +1749,7 @@ static struct cpufreq_driver amd_pstate_ + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", + .attr = amd_pstate_attr, ++ .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static struct cpufreq_driver amd_pstate_epp_driver = { +@@ -1765,6 +1766,7 @@ static struct cpufreq_driver amd_pstate_ + .set_boost = amd_pstate_set_boost, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, ++ .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static int __init amd_pstate_set_driver(int mode_idx) diff --git a/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch new file mode 100644 index 0000000..1b82a70 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch @@ -0,0 +1,103 @@ +From f7b2b3a1c0d015c4272793bed89734c5cffb354c Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:56 +0000 +Subject: cpufreq/amd-pstate: Cleanup the old min_freq qos request remnants + +Convert the freq_qos_request array in struct amd_cpudata to a single +variable (only for max_freq request). Remove the references to cpudata->req +array. Remove and rename the jump labels accordingly. 
+ +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 19 ++++++++----------- + drivers/cpufreq/amd-pstate.h | 4 ++-- + 2 files changed, 10 insertions(+), 13 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -726,7 +726,7 @@ static int amd_pstate_cpu_boost_update(s + policy->max = policy->cpuinfo.max_freq; + + if (cppc_state == AMD_PSTATE_PASSIVE) { +- ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); ++ ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq); + if (ret < 0) + pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu); + } +@@ -993,17 +993,17 @@ static int amd_pstate_cpu_init(struct cp + + ret = amd_pstate_init_perf(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + amd_pstate_init_prefcore(cpudata); + + ret = amd_pstate_init_freq(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + ret = amd_pstate_init_boost_support(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +@@ -1025,11 +1025,11 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req, + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); +- goto free_cpudata2; ++ goto free_cpudata; + } + + cpudata->max_limit_freq = max_freq; +@@ -1042,9 +1042,7 @@ static int amd_pstate_cpu_init(struct cp + + return 0; + +-free_cpudata2: +- freq_qos_remove_request(&cpudata->req[0]); +-free_cpudata1: ++free_cpudata: + kfree(cpudata); + return ret; + } +@@ -1053,8 +1051,7 @@ static void amd_pstate_cpu_exit(struct c + { + struct amd_cpudata *cpudata = policy->driver_data; + +- freq_qos_remove_request(&cpudata->req[1]); +- freq_qos_remove_request(&cpudata->req[0]); ++ freq_qos_remove_request(&cpudata->max_freq_req); + policy->fast_switch_possible = false; + kfree(cpudata); + } +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -28,7 +28,7 @@ struct amd_aperf_mperf { + /** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number +- * @req: constraint request to apply ++ * @max_freq_req: maximum frequency constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions +@@ -68,7 +68,7 @@ struct amd_aperf_mperf { + struct amd_cpudata { + int cpu; + +- struct freq_qos_request req[2]; ++ struct freq_qos_request max_freq_req; + u64 cppc_req_cached; + + u32 highest_perf; diff --git a/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch new file mode 100644 index 0000000..6b342ff --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch @@ -0,0 +1,42 @@ +From d1216c052bedbf6d79e4b0261e2f09e17c66ffd3 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 4 Oct 2024 12:23:04 +0000 +Subject: cpufreq/amd-pstate: Fix amd_pstate mode switch on shared memory + systems + +While switching the 
driver mode between active and passive, Collaborative +Processor Performance Control (CPPC) is disabled in +amd_pstate_unregister_driver(). But, it is not enabled back while registering +the new driver (passive or active). This leads to the new driver mode not +working correctly, so enable it back in amd_pstate_register_driver(). + +Fixes: 3ca7bc818d8c ("cpufreq: amd-pstate: Add guided mode control support via sysfs") +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1221,11 +1221,21 @@ static int amd_pstate_register_driver(in + return -EINVAL; + + cppc_state = mode; ++ ++ ret = amd_pstate_enable(true); ++ if (ret) { ++ pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", ++ ret); ++ amd_pstate_driver_cleanup(); ++ return ret; ++ } ++ + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); + return ret; + } ++ + return 0; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch new file mode 100644 index 0000000..7036362 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch @@ -0,0 +1,57 @@ +From c4fde0d177bdb33912f450914d84d6432391a8b5 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:16 -0500 +Subject: cpufreq/amd-pstate: Use nominal perf for limits when boost is + disabled + +When boost has been disabled the limit for perf should be nominal perf not +the highest perf. Using the latter to do calculations will lead to +incorrect values that are still above nominal. 
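+As a rough illustration with made-up numbers (not tied to any particular CPU):
+assume highest_perf = 228, nominal_perf = 166, and policy->max equal to
+policy->cpuinfo.max_freq. With boost disabled the new code picks
+max_perf = nominal_perf, so
+
+  max_limit_perf = div_u64(policy->max * 166, policy->cpuinfo.max_freq) = 166
+
+whereas scaling against highest_perf would yield 228, i.e. a limit above
+nominal even though boost is off.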
+ +Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") +Reported-by: Peter Jung +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219348 +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 20 ++++++++++++++------ + 1 file changed, 14 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -566,11 +566,16 @@ static int amd_pstate_verify(struct cpuf + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf; ++ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; + struct amd_cpudata *cpudata = policy->driver_data; + +- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); +- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); ++ if (cpudata->boost_supported && !policy->boost_enabled) ++ max_perf = READ_ONCE(cpudata->nominal_perf); ++ else ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ ++ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + + lowest_perf = READ_ONCE(cpudata->lowest_perf); + if (min_limit_perf < lowest_perf) +@@ -1526,10 +1531,13 @@ static int amd_pstate_epp_update_limit(s + u64 value; + s16 epp; + +- max_perf = READ_ONCE(cpudata->highest_perf); ++ if (cpudata->boost_supported && !policy->boost_enabled) ++ max_perf = READ_ONCE(cpudata->nominal_perf); ++ else ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); +- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); ++ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + + if (min_limit_perf < min_perf) + min_limit_perf = min_perf; diff --git a/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch new file mode 100644 index 0000000..0019cc1 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch @@ -0,0 +1,55 @@ +From 01ad0fb3da95867947d923596a26b18d844afe3c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:17 -0500 +Subject: cpufreq/amd-pstate: Don't update CPPC request in + amd_pstate_cpu_boost_update() + +When boost is changed the CPPC value is changed in amd_pstate_cpu_boost_update() +but then changed again when refresh_frequency_limits() and all it's callbacks +occur. The first is a pointless write, so instead just update the limits for +the policy and let the policy refresh anchor everything properly. 
+ +Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 24 +----------------------- + 1 file changed, 1 insertion(+), 23 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -695,34 +695,12 @@ static void amd_pstate_adjust_perf(unsig + static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) + { + struct amd_cpudata *cpudata = policy->driver_data; +- struct cppc_perf_ctrls perf_ctrls; +- u32 highest_perf, nominal_perf, nominal_freq, max_freq; ++ u32 nominal_freq, max_freq; + int ret = 0; + +- highest_perf = READ_ONCE(cpudata->highest_perf); +- nominal_perf = READ_ONCE(cpudata->nominal_perf); + nominal_freq = READ_ONCE(cpudata->nominal_freq); + max_freq = READ_ONCE(cpudata->max_freq); + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { +- u64 value = READ_ONCE(cpudata->cppc_req_cached); +- +- value &= ~GENMASK_ULL(7, 0); +- value |= on ? highest_perf : nominal_perf; +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- } else { +- perf_ctrls.max_perf = on ? highest_perf : nominal_perf; +- ret = cppc_set_perf(cpudata->cpu, &perf_ctrls); +- if (ret) { +- cpufreq_cpu_release(policy); +- pr_debug("Failed to set max perf on CPU:%d. ret:%d\n", +- cpudata->cpu, ret); +- return ret; +- } +- } +- + if (on) + policy->cpuinfo.max_freq = max_freq; + else if (policy->cpuinfo.max_freq > nominal_freq * 1000) diff --git a/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch new file mode 100644 index 0000000..3fd8c77 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch @@ -0,0 +1,49 @@ +From 684d162c08ab86fff02861c907ecc92bf9c09af4 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:18 -0500 +Subject: cpufreq/amd-pstate: Use amd_pstate_update_min_max_limit() for EPP + limits + +When the EPP updates are set the maximum capable frequency for the +CPU is used to set the upper limit instead of that of the policy. + +Adjust amd_pstate_epp_update_limit() to reuse policy calculation code +from amd_pstate_update_min_max_limit(). 
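+Illustrative only, a condensed view of what the EPP path does after this
+change (taken from the hunk below, not a complete function):
+
+  amd_pstate_update_min_max_limit(policy);
+  max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
+                     cpudata->max_limit_perf);
+
+so the min/max limit bookkeeping lives in one helper for both the passive and
+the EPP paths.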
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 19 +++---------------- + 1 file changed, 3 insertions(+), 16 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1505,26 +1505,13 @@ static void amd_pstate_epp_cpu_exit(stru + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf, min_limit_perf, max_limit_perf; ++ u32 max_perf, min_perf; + u64 value; + s16 epp; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); +- +- if (min_limit_perf < min_perf) +- min_limit_perf = min_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; +- +- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); +- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); ++ amd_pstate_update_min_max_limit(policy); + + max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, + cpudata->max_limit_perf); diff --git a/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch new file mode 100644 index 0000000..33964dc --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch @@ -0,0 +1,29 @@ +From fa46d2873c9fa4060ce407e4bc5c7e29babce9d0 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:19 -0500 +Subject: cpufreq/amd-pstate: Drop needless EPP initialization + +The EPP value doesn't need to be cached to the CPPC request in +amd_pstate_epp_update_limit() because it's passed as an argument +at the end to amd_pstate_set_epp() and stored at that time. 
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1548,12 +1548,6 @@ static int amd_pstate_epp_update_limit(s + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + +- /* Set initial EPP value */ +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- } +- + WRITE_ONCE(cpudata->cppc_req_cached, value); + return amd_pstate_set_epp(cpudata, epp); + } diff --git a/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch b/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch new file mode 100644 index 0000000..9fe70c7 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch @@ -0,0 +1,228 @@ +From 649d296be0c7f0df6e71b4fca25fdbe75cb3994e Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Thu, 17 Oct 2024 17:03:11 +0200 +Subject: amd-pstate-6.11: update setting the minimum frequency to + lowest_nonlinear_freq patchset to v3 + +Signed-off-by: Oleksandr Natalenko +--- + drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++--------------- + drivers/cpufreq/amd-pstate.h | 4 +-- + drivers/cpufreq/cpufreq.c | 6 +--- + include/linux/cpufreq.h | 6 ---- + 4 files changed, 43 insertions(+), 40 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -557,9 +557,28 @@ cpufreq_policy_put: + cpufreq_cpu_put(policy); + } + +-static int amd_pstate_verify(struct cpufreq_policy_data *policy) ++static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { +- cpufreq_verify_within_cpu_limits(policy); ++ /* ++ * Initialize lower frequency limit (i.e.policy->min) with ++ * lowest_nonlinear_frequency which is the most energy efficient ++ * frequency. Override the initial value set by cpufreq core and ++ * amd-pstate qos_requests. 
++ */ ++ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { ++ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); ++ struct amd_cpudata *cpudata; ++ ++ if (!policy) ++ return -EINVAL; ++ ++ cpudata = policy->driver_data; ++ policy_data->min = cpudata->lowest_nonlinear_freq; ++ cpufreq_cpu_put(policy); ++ } ++ ++ cpufreq_verify_within_cpu_limits(policy_data); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); + + return 0; + } +@@ -709,7 +728,7 @@ static int amd_pstate_cpu_boost_update(s + policy->max = policy->cpuinfo.max_freq; + + if (cppc_state == AMD_PSTATE_PASSIVE) { +- ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq); ++ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); + if (ret < 0) + pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu); + } +@@ -976,17 +995,17 @@ static int amd_pstate_cpu_init(struct cp + + ret = amd_pstate_init_perf(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + amd_pstate_init_prefcore(cpudata); + + ret = amd_pstate_init_freq(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + ret = amd_pstate_init_boost_support(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +@@ -1008,11 +1027,18 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req, ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], ++ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); ++ if (ret < 0) { ++ dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); ++ goto free_cpudata1; ++ } ++ ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); +- goto free_cpudata; ++ goto free_cpudata2; + } + + cpudata->max_limit_freq = max_freq; +@@ -1025,7 +1051,9 @@ static int amd_pstate_cpu_init(struct cp + + return 0; + +-free_cpudata: ++free_cpudata2: ++ freq_qos_remove_request(&cpudata->req[0]); ++free_cpudata1: + kfree(cpudata); + return ret; + } +@@ -1034,7 +1062,8 @@ static void amd_pstate_cpu_exit(struct c + { + struct amd_cpudata *cpudata = policy->driver_data; + +- freq_qos_remove_request(&cpudata->max_freq_req); ++ freq_qos_remove_request(&cpudata->req[1]); ++ freq_qos_remove_request(&cpudata->req[0]); + policy->fast_switch_possible = false; + kfree(cpudata); + } +@@ -1658,13 +1687,6 @@ static int amd_pstate_epp_cpu_offline(st + return 0; + } + +-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +-{ +- cpufreq_verify_within_cpu_limits(policy); +- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); +- return 0; +-} +- + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1703,13 +1725,6 @@ static int amd_pstate_epp_resume(struct + return 0; + } + +-static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy) +-{ +- struct amd_cpudata *cpudata = policy->driver_data; +- +- return READ_ONCE(cpudata->lowest_nonlinear_freq); +-} +- + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -1723,12 +1738,11 @@ static struct 
cpufreq_driver amd_pstate_ + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", + .attr = amd_pstate_attr, +- .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, +- .verify = amd_pstate_epp_verify_policy, ++ .verify = amd_pstate_verify, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +@@ -1740,7 +1754,6 @@ static struct cpufreq_driver amd_pstate_ + .set_boost = amd_pstate_set_boost, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, +- .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static int __init amd_pstate_set_driver(int mode_idx) +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -28,7 +28,7 @@ struct amd_aperf_mperf { + /** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number +- * @max_freq_req: maximum frequency constraint request to apply ++ * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions +@@ -68,7 +68,7 @@ struct amd_aperf_mperf { + struct amd_cpudata { + int cpu; + +- struct freq_qos_request max_freq_req; ++ struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -1380,7 +1380,6 @@ static int cpufreq_online(unsigned int c + bool new_policy; + unsigned long flags; + unsigned int j; +- u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE; + int ret; + + pr_debug("%s: bringing CPU%u online\n", __func__, cpu); +@@ -1465,12 +1464,9 @@ static int cpufreq_online(unsigned int c + goto out_destroy_policy; + } + +- if (cpufreq_driver->get_init_min_freq) +- init_min_freq = cpufreq_driver->get_init_min_freq(policy); +- + ret = freq_qos_add_request(&policy->constraints, + policy->min_freq_req, FREQ_QOS_MIN, +- init_min_freq); ++ FREQ_QOS_MIN_DEFAULT_VALUE); + if (ret < 0) { + /* + * So we don't call freq_qos_remove_request() for an +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -414,12 +414,6 @@ struct cpufreq_driver { + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); +- +- /* +- * Set by drivers that want to initialize the policy->min_freq_req with +- * a value different from the default value (0) in cpufreq core. +- */ +- int (*get_init_min_freq)(struct cpufreq_policy *policy); + }; + + /* flags */ diff --git a/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch new file mode 100644 index 0000000..cc03ae5 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch @@ -0,0 +1,44 @@ +From db147a0a6341822a15fd9c4cd51f8dc4a9a1747b Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:27 +0000 +Subject: cpufreq/amd-pstate: Call amd_pstate_register() in amd_pstate_init() + +Replace a similar chunk of code in amd_pstate_init() with +amd_pstate_register() call. 
+ +Suggested-by: Mario Limonciello +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 12 ++---------- + 1 file changed, 2 insertions(+), 10 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1909,17 +1909,10 @@ static int __init amd_pstate_init(void) + return ret; + } + +- /* enable amd pstate feature */ +- ret = amd_pstate_enable(true); +- if (ret) { +- pr_err("failed to enable driver mode(%d)\n", cppc_state); +- return ret; +- } +- +- ret = cpufreq_register_driver(current_pstate_driver); ++ ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); +- goto disable_driver; ++ return ret; + } + + dev_root = bus_get_dev_root(&cpu_subsys); +@@ -1936,7 +1929,6 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +-disable_driver: + amd_pstate_enable(false); + return ret; + } diff --git a/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch new file mode 100644 index 0000000..39b68b6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch @@ -0,0 +1,81 @@ +From 7c658490b05f6ab4dd59e1c25e75ba1037f6cfeb Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:29 +0000 +Subject: cpufreq/amd-pstate: Call amd_pstate_set_driver() in + amd_pstate_register_driver() + +Replace a similar chunk of code in amd_pstate_register_driver() with +amd_pstate_set_driver() call. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 47 +++++++++++++++++------------------- + 1 file changed, 22 insertions(+), 25 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1221,16 +1221,32 @@ static void amd_pstate_driver_cleanup(vo + current_pstate_driver = NULL; + } + ++static int amd_pstate_set_driver(int mode_idx) ++{ ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ + static int amd_pstate_register_driver(int mode) + { + int ret; + +- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- else if (mode == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- else +- return -EINVAL; ++ ret = amd_pstate_set_driver(mode); ++ if (ret) ++ return ret; + + cppc_state = mode; + +@@ -1756,25 +1772,6 @@ static struct cpufreq_driver amd_pstate_ + .attr = amd_pstate_epp_attr, + }; + +-static int __init amd_pstate_set_driver(int mode_idx) +-{ +- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { +- cppc_state = mode_idx; +- if (cppc_state == AMD_PSTATE_DISABLE) +- pr_info("driver is explicitly disabled\n"); +- +- if (cppc_state == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- +- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- +- return 0; +- } +- +- return -EINVAL; +-} +- + /* 
+ * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. diff --git a/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch new file mode 100644 index 0000000..507e5e4 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch @@ -0,0 +1,41 @@ +From 55be5db97f4f52badc958463ee8d9cbc2ae91615 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:31 +0000 +Subject: cpufreq/amd-pstate: Remove the switch case in amd_pstate_init() + +Replace the switch case with a more readable if condition. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1873,21 +1873,15 @@ static int __init amd_pstate_init(void) + cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; + } + +- switch (cppc_state) { +- case AMD_PSTATE_DISABLE: ++ if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; +- case AMD_PSTATE_PASSIVE: +- case AMD_PSTATE_ACTIVE: +- case AMD_PSTATE_GUIDED: +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) +- return ret; +- break; +- default: +- return -EINVAL; + } + ++ ret = amd_pstate_set_driver(cppc_state); ++ if (ret) ++ return ret; ++ + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); diff --git a/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch b/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch new file mode 100644 index 0000000..d66b162 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch @@ -0,0 +1,43 @@ +From 7305364888151cb9e6b435c5f219ccfd18132b58 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:33 +0000 +Subject: cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call + +amd_pstate_set_driver() is called twice, once in amd_pstate_init() and once +as part of amd_pstate_register_driver(). Move around code and eliminate +the redundancy. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1878,9 +1878,11 @@ static int __init amd_pstate_init(void) + return -ENODEV; + } + +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) ++ ret = amd_pstate_register_driver(cppc_state); ++ if (ret) { ++ pr_err("failed to register with return %d\n", ret); + return ret; ++ } + + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +@@ -1900,12 +1902,6 @@ static int __init amd_pstate_init(void) + return ret; + } + +- ret = amd_pstate_register_driver(cppc_state); +- if (ret) { +- pr_err("failed to register with return %d\n", ret); +- return ret; +- } +- + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group); diff --git a/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch b/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch new file mode 100644 index 0000000..30b28d1 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch @@ -0,0 +1,33 @@ +From 5886ef269d069c72ea952cb00699e16221289e8c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 17 Oct 2024 12:34:39 -0500 +Subject: cpufreq/amd-pstate-ut: Add fix for min freq unit test + +commit 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to +lowest_nonlinear_freq") changed the iniital minimum frequency to lowest +nonlinear frequency, but the unit tests weren't updated and now fail. + +Update them to match this same change. + +Fixes: 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate-ut.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 + goto skip_test; + } + +- if (cpudata->min_freq != policy->min) { ++ if (cpudata->lowest_nonlinear_freq != policy->min) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; +- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", +- __func__, cpu, cpudata->min_freq, policy->min); ++ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", ++ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); + goto skip_test; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch b/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch new file mode 100644 index 0000000..528b7cb --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch @@ -0,0 +1,33 @@ +From e82b9b5a56bcac18cae68878fe67263279805735 Mon Sep 17 00:00:00 2001 +From: "Gautham R. Shenoy" +Date: Mon, 21 Oct 2024 15:48:35 +0530 +Subject: amd-pstate: Set min_perf to nominal_perf for active mode performance + gov + +The amd-pstate driver sets CPPC_REQ.min_perf to CPPC_REQ.max_perf when +in active mode with performance governor. Typically CPPC_REQ.max_perf +is set to CPPC.highest_perf. 
This causes frequency throttling on +power-limited platforms which causes performance regressions on +certain classes of workloads. + +Hence, set the CPPC_REQ.min_perf to the CPPC.nominal_perf or +CPPC_REQ.max_perf, whichever is lower of the two. + +Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors") +Signed-off-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1565,7 +1565,7 @@ static int amd_pstate_epp_update_limit(s + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- min_perf = max_perf; ++ min_perf = min(cpudata->nominal_perf, max_perf); + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); diff --git a/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch b/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch new file mode 100644 index 0000000..bfe6fd4 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch @@ -0,0 +1,44 @@ +From 497447cf96a785a4edd0756da5d5718037f5687c Mon Sep 17 00:00:00 2001 +From: Swapnil Sapkal +Date: Mon, 21 Oct 2024 15:48:36 +0530 +Subject: amd-pstate: Switch to amd-pstate by default on some Server platforms + +Currently the default cpufreq driver for all the AMD EPYC servers is +acpi-cpufreq. Going forward, switch to amd-pstate as the default +driver on the AMD EPYC server platforms with CPU family 0x1A or +higher. The default mode will be active mode. + +Testing shows that amd-pstate with active mode and performance +governor provides comparable or better performance per-watt against +acpi-cpufreq + performance governor. + +Likewise, amd-pstate with active mode and powersave governor with the +energy_performance_preference=power (EPP=255) provides comparable or +better performance per-watt against acpi-cpufreq + schedutil governor +for a wide range of workloads. + +Users can still revert to using acpi-cpufreq driver on these platforms +with the "amd_pstate=disable" kernel commandline parameter. + +Signed-off-by: Swapnil Sapkal +Signed-off-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1862,10 +1862,10 @@ static int __init amd_pstate_init(void) + if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms +- * 2. Server platforms ++ * 2. Server platforms with CPUs older than Family 0x1A. 
+ */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server()) { ++ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } diff --git a/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch b/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch new file mode 100644 index 0000000..f4f044b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch @@ -0,0 +1,38 @@ +From a4d255935a1ea6e4b10167df942ec641079bcdf7 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Mon, 28 Oct 2024 09:55:41 -0500 +Subject: cpufreq/amd-pstate: Push adjust_perf vfunc init into cpu_init + +As the driver can be changed in and out of different modes it's possible +that adjust_perf is assigned when it shouldn't be. + +This could happen if an MSR design is started up in passive mode and then +switches to active mode. + +To solve this explicitly clear `adjust_perf` in amd_pstate_epp_cpu_init(). + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1528,6 +1528,8 @@ static int amd_pstate_epp_cpu_init(struc + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + ++ current_pstate_driver->adjust_perf = NULL; ++ + return 0; + + free_cpudata1: +@@ -1887,8 +1889,6 @@ static int __init amd_pstate_init(void) + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- if (cppc_state != AMD_PSTATE_ACTIVE) +- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); + static_call_update(amd_pstate_enable, shmem_enable); diff --git a/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch b/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch new file mode 100644 index 0000000..32b6282 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch @@ -0,0 +1,47 @@ +From c42a82a583646dcbba8500d47ed878616ab5c33a Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Mon, 28 Oct 2024 09:55:42 -0500 +Subject: cpufreq/amd-pstate: Move registration after static function call + update + +On shared memory designs the static functions need to work before +registration is done or the system can hang at bootup. + +Move the registration later in amd_pstate_init() to solve this. 
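+A rough sketch of the resulting ordering in amd_pstate_init() (condensed from
+the hunks below, illustrative only):
+
+  /* 1. retarget the static calls for shared memory designs first */
+  static_call_update(amd_pstate_enable, shmem_enable);
+  static_call_update(amd_pstate_init_perf, shmem_init_perf);
+  static_call_update(amd_pstate_update_perf, shmem_update_perf);
+
+  /* 2. only then register the driver, which may already invoke them */
+  ret = amd_pstate_register_driver(cppc_state);
+
+so the shared memory helpers are in place before any CPU init path can reach
+them through the static calls.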
+ +Fixes: e238968a2087 ("cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call") +Reported-by: Klara Modin +Closes: https://lore.kernel.org/linux-pm/cf9c146d-bacf-444e-92e2-15ebf513af96@gmail.com/#t +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1880,12 +1880,6 @@ static int __init amd_pstate_init(void) + return -ENODEV; + } + +- ret = amd_pstate_register_driver(cppc_state); +- if (ret) { +- pr_err("failed to register with return %d\n", ret); +- return ret; +- } +- + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +@@ -1896,6 +1890,12 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + ++ ret = amd_pstate_register_driver(cppc_state); ++ if (ret) { ++ pr_err("failed to register with return %d\n", ret); ++ return ret; ++ } ++ + if (amd_pstate_prefcore) { + ret = amd_detect_prefcore(&amd_pstate_prefcore); + if (ret) diff --git a/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch b/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch new file mode 100644 index 0000000..8c7f40f --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch @@ -0,0 +1,321 @@ +From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:37 -0700 +Subject: perf: Generic hotplug support for a PMU with a scope + +The perf subsystem assumes that the counters of a PMU are per-CPU. So +the user space tool reads a counter from each CPU in the system wide +mode. However, many PMUs don't have a per-CPU counter. The counter is +effective for a scope, e.g., a die or a socket. To address this, a +cpumask is exposed by the kernel driver to restrict to one CPU to stand +for a specific scope. In case the given CPU is removed, +the hotplug support has to be implemented for each such driver. + +The codes to support the cpumask and hotplug are very similar. +- Expose a cpumask into sysfs +- Pickup another CPU in the same scope if the given CPU is removed. +- Invoke the perf_pmu_migrate_context() to migrate to a new CPU. +- In event init, always set the CPU in the cpumask to event->cpu + +Similar duplicated codes are implemented for each such PMU driver. It +would be good to introduce a generic infrastructure to avoid such +duplication. + +5 popular scopes are implemented here, core, die, cluster, pkg, and +the system-wide. The scope can be set when a PMU is registered. If so, a +"cpumask" is automatically exposed for the PMU. + +The "cpumask" is from the perf_online__mask, which is to track +the active CPU for each scope. They are set when the first CPU of the +scope is online via the generic perf hotplug support. When a +corresponding CPU is removed, the perf_online__mask is updated +accordingly and the PMU will be moved to a new CPU from the same scope +if possible. 
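+As a minimal, hypothetical usage sketch (the pmu name below is made up; the
+scope constant and the cpumask behaviour come from the hunks that follow):
+
+  static struct pmu example_pmu = {
+          /* event_init/add/start/stop/read callbacks as before */
+          .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+          .scope        = PERF_PMU_SCOPE_PKG,
+  };
+
+With a scope set, the core exposes the "cpumask" attribute for the PMU and, on
+hotplug, migrates its context to another online CPU of the same package via
+perf_pmu_migrate_context(), so the driver no longer needs its own bookkeeping.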
+ +Signed-off-by: Kan Liang +--- + include/linux/perf_event.h | 18 ++++ + kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 180 insertions(+), 2 deletions(-) + +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -292,6 +292,19 @@ struct perf_event_pmu_context; + #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 + #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 + ++/** ++ * pmu::scope ++ */ ++enum perf_pmu_scope { ++ PERF_PMU_SCOPE_NONE = 0, ++ PERF_PMU_SCOPE_CORE, ++ PERF_PMU_SCOPE_DIE, ++ PERF_PMU_SCOPE_CLUSTER, ++ PERF_PMU_SCOPE_PKG, ++ PERF_PMU_SCOPE_SYS_WIDE, ++ PERF_PMU_MAX_SCOPE, ++}; ++ + struct perf_output_handle; + + #define PMU_NULL_DEV ((void *)(~0UL)) +@@ -315,6 +328,11 @@ struct pmu { + */ + int capabilities; + ++ /* ++ * PMU scope ++ */ ++ unsigned int scope; ++ + int __percpu *pmu_disable_count; + struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -411,6 +411,11 @@ static LIST_HEAD(pmus); + static DEFINE_MUTEX(pmus_lock); + static struct srcu_struct pmus_srcu; + static cpumask_var_t perf_online_mask; ++static cpumask_var_t perf_online_core_mask; ++static cpumask_var_t perf_online_die_mask; ++static cpumask_var_t perf_online_cluster_mask; ++static cpumask_var_t perf_online_pkg_mask; ++static cpumask_var_t perf_online_sys_mask; + static struct kmem_cache *perf_event_cache; + + /* +@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct + } + static DEVICE_ATTR_RW(perf_event_mux_interval_ms); + ++static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) ++{ ++ switch (scope) { ++ case PERF_PMU_SCOPE_CORE: ++ return topology_sibling_cpumask(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_die_cpumask(cpu); ++ case PERF_PMU_SCOPE_CLUSTER: ++ return topology_cluster_cpumask(cpu); ++ case PERF_PMU_SCOPE_PKG: ++ return topology_core_cpumask(cpu); ++ case PERF_PMU_SCOPE_SYS_WIDE: ++ return cpu_online_mask; ++ } ++ ++ return NULL; ++} ++ ++static inline struct cpumask *perf_scope_cpumask(unsigned int scope) ++{ ++ switch (scope) { ++ case PERF_PMU_SCOPE_CORE: ++ return perf_online_core_mask; ++ case PERF_PMU_SCOPE_DIE: ++ return perf_online_die_mask; ++ case PERF_PMU_SCOPE_CLUSTER: ++ return perf_online_cluster_mask; ++ case PERF_PMU_SCOPE_PKG: ++ return perf_online_pkg_mask; ++ case PERF_PMU_SCOPE_SYS_WIDE: ++ return perf_online_sys_mask; ++ } ++ ++ return NULL; ++} ++ ++static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct pmu *pmu = dev_get_drvdata(dev); ++ struct cpumask *mask = perf_scope_cpumask(pmu->scope); ++ ++ if (mask) ++ return cpumap_print_to_pagebuf(true, buf, mask); ++ return 0; ++} ++ ++static DEVICE_ATTR_RO(cpumask); ++ + static struct attribute *pmu_dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, ++ &dev_attr_cpumask.attr, + NULL, + }; + +@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct + if (n == 2 && !pmu->nr_addr_filters) + return 0; + ++ /* cpumask */ ++ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) ++ return 0; ++ + return a->mode; + } + +@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, c + goto free_pdc; + } + ++ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { ++ ret = -EINVAL; ++ goto free_pdc; ++ } ++ + pmu->name = name; + + if (type >= 0) +@@ 
-11750,6 +11814,22 @@ static int perf_try_init_event(struct pm + event_has_any_exclude_flag(event)) + ret = -EINVAL; + ++ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); ++ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); ++ int cpu; ++ ++ if (pmu_cpumask && cpumask) { ++ cpu = cpumask_any_and(pmu_cpumask, cpumask); ++ if (cpu >= nr_cpu_ids) ++ ret = -ENODEV; ++ else ++ event->cpu = cpu; ++ } else { ++ ret = -ENODEV; ++ } ++ } ++ + if (ret && event->destroy) + event->destroy(event); + } +@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_c + int cpu; + + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); ++ + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); +@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(vo + raw_spin_unlock(&ctx->lock); + } + ++static void perf_event_clear_cpumask(unsigned int cpu) ++{ ++ int target[PERF_PMU_MAX_SCOPE]; ++ unsigned int scope; ++ struct pmu *pmu; ++ ++ cpumask_clear_cpu(cpu, perf_online_mask); ++ ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); ++ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); ++ ++ target[scope] = -1; ++ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) ++ continue; ++ ++ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) ++ continue; ++ target[scope] = cpumask_any_but(cpumask, cpu); ++ if (target[scope] < nr_cpu_ids) ++ cpumask_set_cpu(target[scope], pmu_cpumask); ++ } ++ ++ /* migrate */ ++ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ++ if (pmu->scope == PERF_PMU_SCOPE_NONE || ++ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) ++ continue; ++ ++ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) ++ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); ++ } ++} ++ + static void perf_event_exit_cpu_context(int cpu) + { + struct perf_cpu_context *cpuctx; +@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context( + + // XXX simplify cpuctx->online + mutex_lock(&pmus_lock); ++ /* ++ * Clear the cpumasks, and migrate to other CPUs if possible. ++ * Must be invoked before the __perf_event_exit_context. ++ */ ++ perf_event_clear_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + +@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context( + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); +- cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); + } + #else +@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context( + + #endif + ++static void perf_event_setup_cpumask(unsigned int cpu) ++{ ++ struct cpumask *pmu_cpumask; ++ unsigned int scope; ++ ++ cpumask_set_cpu(cpu, perf_online_mask); ++ ++ /* ++ * Early boot stage, the cpumask hasn't been set yet. ++ * The perf_online__masks includes the first CPU of each domain. ++ * Always uncondifionally set the boot CPU for the perf_online__masks. 
++ */ ++ if (!topology_sibling_cpumask(cpu)) { ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ pmu_cpumask = perf_scope_cpumask(scope); ++ if (WARN_ON_ONCE(!pmu_cpumask)) ++ continue; ++ cpumask_set_cpu(cpu, pmu_cpumask); ++ } ++ return; ++ } ++ ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); ++ ++ pmu_cpumask = perf_scope_cpumask(scope); ++ ++ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) ++ continue; ++ ++ if (!cpumask_empty(cpumask) && ++ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) ++ cpumask_set_cpu(cpu, pmu_cpumask); ++ } ++} ++ + int perf_event_init_cpu(unsigned int cpu) + { + struct perf_cpu_context *cpuctx; +@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); +- cpumask_set_cpu(cpu, perf_online_mask); ++ perf_event_setup_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + diff --git a/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch b/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch new file mode 100644 index 0000000..3240af8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch @@ -0,0 +1,71 @@ +From 8c7eb17e722a6a45c4436e5debb9336089b21d9b Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:38 -0700 +Subject: perf: Add PERF_EV_CAP_READ_SCOPE + +Usually, an event can be read from any CPU of the scope. It doesn't need +to be read from the advertised CPU. + +Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with +scope can be read from any active CPU in the scope. + +Signed-off-by: Kan Liang +--- + include/linux/perf_event.h | 3 +++ + kernel/events/core.c | 14 +++++++++++--- + 2 files changed, 14 insertions(+), 3 deletions(-) + +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -633,10 +633,13 @@ typedef void (*perf_overflow_handler_t)( + * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and + * cannot be a group leader. If an event with this flag is detached from the + * group it is scheduled out and moved into an unrecoverable ERROR state. ++ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the ++ * PMU scope where it is active. 
+ */ + #define PERF_EV_CAP_SOFTWARE BIT(0) + #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) + #define PERF_EV_CAP_SIBLING BIT(2) ++#define PERF_EV_CAP_READ_SCOPE BIT(3) + + #define SWEVENT_HLIST_BITS 8 + #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -4477,16 +4477,24 @@ struct perf_read_data { + int ret; + }; + ++static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); ++ + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) + { ++ int local_cpu = smp_processor_id(); + u16 local_pkg, event_pkg; + + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + +- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { +- int local_cpu = smp_processor_id(); ++ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); ++ ++ if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) ++ return local_cpu; ++ } + ++ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + +@@ -11824,7 +11832,7 @@ static int perf_try_init_event(struct pm + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else +- event->cpu = cpu; ++ event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } diff --git a/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch b/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch new file mode 100644 index 0000000..299cac0 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch @@ -0,0 +1,286 @@ +From 09c1529eb102b486220c35546f2663ca858a2943 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:39 -0700 +Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug + +There are three cstate PMUs with different scopes, core, die and module. +The scopes are supported by the generic perf_event subsystem now. + +Set the scope for each PMU and remove all the cpumask and hotplug codes. 
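+In short (illustrative, condensed from the hunks below), the three PMUs end up
+declaring:
+
+  cstate_core_pmu.scope   = PERF_PMU_SCOPE_CORE;
+  cstate_pkg_pmu.scope    = PERF_PMU_SCOPE_PKG;     /* switched to _DIE on multi-die packages */
+  cstate_module_pmu.scope = PERF_PMU_SCOPE_CLUSTER;
+
+and the per-PMU cpumask files plus the CPUHP_AP_PERF_X86_CSTATE_* hotplug
+states become unnecessary.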
+ +Signed-off-by: Kan Liang +--- + arch/x86/events/intel/cstate.c | 142 ++------------------------------- + include/linux/cpuhotplug.h | 2 - + 2 files changed, 5 insertions(+), 139 deletions(-) + +--- a/arch/x86/events/intel/cstate.c ++++ b/arch/x86/events/intel/cstate.c +@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(st + static struct device_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +-static ssize_t cstate_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, +- char *buf); +- + /* Model -> events mapping */ + struct cstate_model { + unsigned long core_events; +@@ -206,22 +202,9 @@ static struct attribute_group cstate_for + .attrs = cstate_format_attrs, + }; + +-static cpumask_t cstate_core_cpu_mask; +-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); +- +-static struct attribute *cstate_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group cpumask_attr_group = { +- .attrs = cstate_cpumask_attrs, +-}; +- + static const struct attribute_group *cstate_attr_groups[] = { + &cstate_events_attr_group, + &cstate_format_attr_group, +- &cpumask_attr_group, + NULL, + }; + +@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, + }; + +-static cpumask_t cstate_pkg_cpu_mask; +- + /* cstate_module PMU */ + static struct pmu cstate_module_pmu; + static bool has_cstate_module; +@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = { + [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr }, + }; + +-static cpumask_t cstate_module_cpu_mask; +- +-static ssize_t cstate_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, +- char *buf) +-{ +- struct pmu *pmu = dev_get_drvdata(dev); +- +- if (pmu == &cstate_core_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); +- else if (pmu == &cstate_pkg_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); +- else if (pmu == &cstate_module_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask); +- else +- return 0; +-} +- + static int cstate_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config; +- int cpu; + + if (event->attr.type != event->pmu->type) + return -ENOENT; +@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct + if (!(core_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_core_cpu_mask, +- topology_sibling_cpumask(event->cpu)); + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); + if (!(pkg_msr_mask & (1 << cfg))) + return -EINVAL; +- +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + event->hw.event_base = pkg_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_pkg_cpu_mask, +- topology_die_cpumask(event->cpu)); + } else if (event->pmu == &cstate_module_pmu) { + if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX) + return -EINVAL; +@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct + if (!(module_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = module_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_module_cpu_mask, +- topology_cluster_cpumask(event->cpu)); + } else { + return -ENOENT; + } + +- if (cpu >= nr_cpu_ids) +- return -ENODEV; +- +- event->cpu = cpu; + 
event->hw.config = cfg; + event->hw.idx = -1; + return 0; +@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct p + return 0; + } + +-/* +- * Check if exiting cpu is the designated reader. If so migrate the +- * events when there is a valid target available +- */ +-static int cstate_cpu_exit(unsigned int cpu) +-{ +- unsigned int target; +- +- if (has_cstate_core && +- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { +- +- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_core_cpu_mask); +- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); +- } +- } +- +- if (has_cstate_pkg && +- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { +- +- target = cpumask_any_but(topology_die_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_pkg_cpu_mask); +- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); +- } +- } +- +- if (has_cstate_module && +- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) { +- +- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_module_cpu_mask); +- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target); +- } +- } +- return 0; +-} +- +-static int cstate_cpu_init(unsigned int cpu) +-{ +- unsigned int target; +- +- /* +- * If this is the first online thread of that core, set it in +- * the core cpu mask as the designated reader. +- */ +- target = cpumask_any_and(&cstate_core_cpu_mask, +- topology_sibling_cpumask(cpu)); +- +- if (has_cstate_core && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_core_cpu_mask); +- +- /* +- * If this is the first online thread of that package, set it +- * in the package cpu mask as the designated reader. +- */ +- target = cpumask_any_and(&cstate_pkg_cpu_mask, +- topology_die_cpumask(cpu)); +- if (has_cstate_pkg && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); +- +- /* +- * If this is the first online thread of that cluster, set it +- * in the cluster cpu mask as the designated reader. 
+- */ +- target = cpumask_any_and(&cstate_module_cpu_mask, +- topology_cluster_cpumask(cpu)); +- if (has_cstate_module && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_module_cpu_mask); +- +- return 0; +-} +- + static const struct attribute_group *core_attr_update[] = { + &group_cstate_core_c1, + &group_cstate_core_c3, +@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_CORE, + .module = THIS_MODULE, + }; + +@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_PKG, + .module = THIS_MODULE, + }; + +@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_CLUSTER, + .module = THIS_MODULE, + }; + +@@ -810,9 +684,6 @@ static int __init cstate_probe(const str + + static inline void cstate_cleanup(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); +- + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); + +@@ -827,11 +698,6 @@ static int __init cstate_init(void) + { + int err; + +- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, +- "perf/x86/cstate:starting", cstate_cpu_init, NULL); +- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, +- "perf/x86/cstate:online", NULL, cstate_cpu_exit); +- + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (err) { +@@ -844,6 +710,8 @@ static int __init cstate_init(void) + + if (has_cstate_pkg) { + if (topology_max_dies_per_package() > 1) { ++ /* CLX-AP is multi-die and the cstate is die-scope */ ++ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE; + err = perf_pmu_register(&cstate_pkg_pmu, + "cstate_die", -1); + } else { +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -152,7 +152,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, + CPUHP_AP_PERF_X86_STARTING, + CPUHP_AP_PERF_X86_AMD_IBS_STARTING, +- CPUHP_AP_PERF_X86_CSTATE_STARTING, + CPUHP_AP_PERF_XTENSA_STARTING, + CPUHP_AP_ARM_VFP_STARTING, + CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, +@@ -209,7 +208,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, + CPUHP_AP_PERF_X86_RAPL_ONLINE, +- CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, diff --git a/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch b/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch new file mode 100644 index 0000000..5ae825e --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch @@ -0,0 +1,188 @@ +From f91da33af8295b4b3d73a2083225f69e1d5ff301 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:40 -0700 +Subject: iommu/vt-d: Clean up cpumask and hotplug for perfmon + +The iommu PMU is system-wide scope, which is supported by the generic +perf_event subsystem now. + +Set the scope for the iommu PMU and remove all the cpumask and hotplug +codes. 
+ +Reviewed-by: Lu Baolu +Signed-off-by: Kan Liang +Cc: David Woodhouse +Cc: Joerg Roedel +Cc: Will Deacon +Cc: iommu@lists.linux.dev +--- + drivers/iommu/intel/iommu.h | 2 - + drivers/iommu/intel/perfmon.c | 111 +--------------------------------- + 2 files changed, 2 insertions(+), 111 deletions(-) + +--- a/drivers/iommu/intel/iommu.h ++++ b/drivers/iommu/intel/iommu.h +@@ -687,8 +687,6 @@ struct iommu_pmu { + DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); + struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; + unsigned char irq_name[16]; +- struct hlist_node cpuhp_node; +- int cpu; + }; + + #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) +--- a/drivers/iommu/intel/perfmon.c ++++ b/drivers/iommu/intel/perfmon.c +@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_ + .attrs = attrs_empty, + }; + +-static cpumask_t iommu_pmu_cpu_mask; +- +-static ssize_t +-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); +-} +-static DEVICE_ATTR_RO(cpumask); +- +-static struct attribute *iommu_pmu_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL +-}; +- +-static struct attribute_group iommu_pmu_cpumask_attr_group = { +- .attrs = iommu_pmu_cpumask_attrs, +-}; +- + static const struct attribute_group *iommu_pmu_attr_groups[] = { + &iommu_pmu_format_attr_group, + &iommu_pmu_events_attr_group, +- &iommu_pmu_cpumask_attr_group, + NULL + }; + +@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct i + iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; + iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; + iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; + iommu_pmu->pmu.module = THIS_MODULE; + + return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); +@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(st + iommu->perf_irq = 0; + } + +-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +-{ +- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); +- +- if (cpumask_empty(&iommu_pmu_cpu_mask)) +- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); +- +- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) +- iommu_pmu->cpu = cpu; +- +- return 0; +-} +- +-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) +-{ +- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); +- int target = cpumask_first(&iommu_pmu_cpu_mask); +- +- /* +- * The iommu_pmu_cpu_mask has been updated when offline the CPU +- * for the first iommu_pmu. Migrate the other iommu_pmu to the +- * new target. 
+- */ +- if (target < nr_cpu_ids && target != iommu_pmu->cpu) { +- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); +- iommu_pmu->cpu = target; +- return 0; +- } +- +- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) +- return 0; +- +- target = cpumask_any_but(cpu_online_mask, cpu); +- +- if (target < nr_cpu_ids) +- cpumask_set_cpu(target, &iommu_pmu_cpu_mask); +- else +- return 0; +- +- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); +- iommu_pmu->cpu = target; +- +- return 0; +-} +- +-static int nr_iommu_pmu; +-static enum cpuhp_state iommu_cpuhp_slot; +- +-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) +-{ +- int ret; +- +- if (!nr_iommu_pmu) { +- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, +- "driver/iommu/intel/perfmon:online", +- iommu_pmu_cpu_online, +- iommu_pmu_cpu_offline); +- if (ret < 0) +- return ret; +- iommu_cpuhp_slot = ret; +- } +- +- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); +- if (ret) { +- if (!nr_iommu_pmu) +- cpuhp_remove_multi_state(iommu_cpuhp_slot); +- return ret; +- } +- nr_iommu_pmu++; +- +- return 0; +-} +- +-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) +-{ +- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); +- +- if (--nr_iommu_pmu) +- return; +- +- cpuhp_remove_multi_state(iommu_cpuhp_slot); +-} +- + void iommu_pmu_register(struct intel_iommu *iommu) + { + struct iommu_pmu *iommu_pmu = iommu->pmu; +@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iom + if (__iommu_pmu_register(iommu)) + goto err; + +- if (iommu_pmu_cpuhp_setup(iommu_pmu)) +- goto unregister; +- + /* Set interrupt for overflow */ + if (iommu_pmu_set_interrupt(iommu)) +- goto cpuhp_free; ++ goto unregister; + + return; + +-cpuhp_free: +- iommu_pmu_cpuhp_free(iommu_pmu); + unregister: + perf_pmu_unregister(&iommu_pmu->pmu); + err: +@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_i + return; + + iommu_pmu_unset_interrupt(iommu); +- iommu_pmu_cpuhp_free(iommu_pmu); + perf_pmu_unregister(&iommu_pmu->pmu); + } diff --git a/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch b/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch new file mode 100644 index 0000000..df672f0 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch @@ -0,0 +1,238 @@ +From 76278bd3946d618ead2d9cc22612a75a4ab99ace Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:41 -0700 +Subject: dmaengine: idxd: Clean up cpumask and hotplug for perfmon + +The idxd PMU is system-wide scope, which is supported by the generic +perf_event subsystem now. + +Set the scope for the idxd PMU and remove all the cpumask and hotplug +codes. 
+ +Signed-off-by: Kan Liang +Cc: Fenghua Yu +Cc: Dave Jiang +Cc: Vinod Koul +Cc: dmaengine@vger.kernel.org +Reviewed-by: Dave Jiang +Reviewed-by: Fenghua Yu +--- + drivers/dma/idxd/idxd.h | 7 --- + drivers/dma/idxd/init.c | 3 -- + drivers/dma/idxd/perfmon.c | 98 +------------------------------------- + 3 files changed, 1 insertion(+), 107 deletions(-) + +--- a/drivers/dma/idxd/idxd.h ++++ b/drivers/dma/idxd/idxd.h +@@ -124,7 +124,6 @@ struct idxd_pmu { + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; +- int cpu; + + int n_counters; + int counter_width; +@@ -135,8 +134,6 @@ struct idxd_pmu { + + unsigned long supported_filters; + int n_filters; +- +- struct hlist_node cpuhp_node; + }; + + #define IDXD_MAX_PRIORITY 0xf +@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct + int perfmon_pmu_init(struct idxd_device *idxd); + void perfmon_pmu_remove(struct idxd_device *idxd); + void perfmon_counter_overflow(struct idxd_device *idxd); +-void perfmon_init(void); +-void perfmon_exit(void); + #else + static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } + static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} + static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +-static inline void perfmon_init(void) {} +-static inline void perfmon_exit(void) {} + #endif + + /* debugfs */ +--- a/drivers/dma/idxd/init.c ++++ b/drivers/dma/idxd/init.c +@@ -878,8 +878,6 @@ static int __init idxd_init_module(void) + else + support_enqcmd = true; + +- perfmon_init(); +- + err = idxd_driver_register(&idxd_drv); + if (err < 0) + goto err_idxd_driver_register; +@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void + idxd_driver_unregister(&idxd_drv); + pci_unregister_driver(&idxd_pci_driver); + idxd_cdev_remove(); +- perfmon_exit(); + idxd_remove_debugfs(); + } + module_exit(idxd_exit_module); +--- a/drivers/dma/idxd/perfmon.c ++++ b/drivers/dma/idxd/perfmon.c +@@ -6,29 +6,6 @@ + #include "idxd.h" + #include "perfmon.h" + +-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, +- char *buf); +- +-static cpumask_t perfmon_dsa_cpu_mask; +-static bool cpuhp_set_up; +-static enum cpuhp_state cpuhp_slot; +- +-/* +- * perf userspace reads this attribute to determine which cpus to open +- * counters on. It's connected to perfmon_dsa_cpu_mask, which is +- * maintained by the cpu hotplug handlers. +- */ +-static DEVICE_ATTR_RO(cpumask); +- +-static struct attribute *perfmon_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group cpumask_attr_group = { +- .attrs = perfmon_cpumask_attrs, +-}; +- + /* + * These attributes specify the bits in the config word that the perf + * syscall uses to pass the event ids and categories to perfmon. 
+@@ -67,16 +44,9 @@ static struct attribute_group perfmon_fo + + static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, +- &cpumask_attr_group, + NULL, + }; + +-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, +- char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +-} +- + static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) + { + return &idxd_pmu->pmu == event->pmu; +@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); +- event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) +@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pm + idxd_pmu->pmu.stop = perfmon_pmu_event_stop; + idxd_pmu->pmu.read = perfmon_pmu_event_update; + idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; + idxd_pmu->pmu.module = THIS_MODULE; + } + +@@ -496,47 +466,11 @@ void perfmon_pmu_remove(struct idxd_devi + if (!idxd->idxd_pmu) + return; + +- cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node); + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + kfree(idxd->idxd_pmu); + idxd->idxd_pmu = NULL; + } + +-static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) +-{ +- struct idxd_pmu *idxd_pmu; +- +- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); +- +- /* select the first online CPU as the designated reader */ +- if (cpumask_empty(&perfmon_dsa_cpu_mask)) { +- cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); +- idxd_pmu->cpu = cpu; +- } +- +- return 0; +-} +- +-static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) +-{ +- struct idxd_pmu *idxd_pmu; +- unsigned int target; +- +- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); +- +- if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) +- return 0; +- +- target = cpumask_any_but(cpu_online_mask, cpu); +- /* migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); +- perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); +- } +- +- return 0; +-} +- + int perfmon_pmu_init(struct idxd_device *idxd) + { + union idxd_perfcap perfcap; +@@ -544,12 +478,6 @@ int perfmon_pmu_init(struct idxd_device + int rc = -ENODEV; + + /* +- * perfmon module initialization failed, nothing to do +- */ +- if (!cpuhp_set_up) +- return -ENODEV; +- +- /* + * If perfmon_offset or num_counters is 0, it means perfmon is + * not supported on this hardware. 
+ */ +@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device + if (rc) + goto free; + +- rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); +- if (rc) { +- perf_pmu_unregister(&idxd->idxd_pmu->pmu); +- goto free; +- } + out: + return rc; + free: +@@ -637,22 +560,3 @@ free: + + goto out; + } +- +-void __init perfmon_init(void) +-{ +- int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, +- "driver/dma/idxd/perf:online", +- perf_event_cpu_online, +- perf_event_cpu_offline); +- if (WARN_ON(rc < 0)) +- return; +- +- cpuhp_slot = rc; +- cpuhp_set_up = true; +-} +- +-void __exit perfmon_exit(void) +-{ +- if (cpuhp_set_up) +- cpuhp_remove_multi_state(cpuhp_slot); +-} diff --git a/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch b/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch new file mode 100644 index 0000000..c78ed41 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch @@ -0,0 +1,84 @@ +From fb0a3b5932882f02ed42fcaa6db73aba3eafd6d7 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:42 -0700 +Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug + +The rapl pmu just needs to be allocated once. It doesn't matter to be +allocated at each CPU hotplug, or the global init_rapl_pmus(). + +Move the pmu allocation to the init_rapl_pmus(). So the generic hotplug +supports can be applied. + +Signed-off-by: Kan Liang +Cc: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 44 +++++++++++++++++++++++++++++------------- + 1 file changed, 31 insertions(+), 13 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -568,19 +568,8 @@ static int rapl_cpu_online(unsigned int + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + +- if (!pmu) { +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) +- return -ENOMEM; +- +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); +- +- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; +- } ++ if (!pmu) ++ return -ENOMEM; + + /* + * Check if there is an online cpu in the package which collects rapl +@@ -673,6 +662,32 @@ static const struct attribute_group *rap + NULL, + }; + ++static void __init init_rapl_pmu(void) ++{ ++ struct rapl_pmu *pmu; ++ int cpu; ++ ++ cpus_read_lock(); ++ ++ for_each_cpu(cpu, cpu_online_mask) { ++ pmu = cpu_to_rapl_pmu(cpu); ++ if (pmu) ++ continue; ++ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); ++ if (!pmu) ++ continue; ++ raw_spin_lock_init(&pmu->lock); ++ INIT_LIST_HEAD(&pmu->active_list); ++ pmu->pmu = &rapl_pmus->pmu; ++ pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(pmu); ++ ++ rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; ++ } ++ ++ cpus_read_unlock(); ++} ++ + static int __init init_rapl_pmus(void) + { + int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); +@@ -693,6 +708,9 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ ++ init_rapl_pmu(); ++ + return 0; + } + diff --git a/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch 
b/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch new file mode 100644 index 0000000..565c457 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch @@ -0,0 +1,179 @@ +From 7b4f6ba1b1dc5f3120652bcb5921a697d5167bff Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:43 -0700 +Subject: perf/x86/rapl: Clean up cpumask and hotplug + +The rapl pmu is die scope, which is supported by the generic perf_event +subsystem now. + +Set the scope for the rapl PMU and remove all the cpumask and hotplug +codes. + +Signed-off-by: Kan Liang +Cc: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 80 +------------------------------------- + include/linux/cpuhotplug.h | 1 - + 2 files changed, 2 insertions(+), 79 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -135,7 +135,6 @@ struct rapl_model { + /* 1/2^hw_unit Joule */ + static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; + static struct rapl_pmus *rapl_pmus; +-static cpumask_t rapl_cpu_mask; + static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; +@@ -340,8 +339,6 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + return -EINVAL; + +@@ -360,7 +357,6 @@ static int rapl_pmu_event_init(struct pe + pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; +- event->cpu = pmu->cpu; + event->pmu_private = pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; +@@ -374,23 +370,6 @@ static void rapl_pmu_event_read(struct p + rapl_event_update(event); + } + +-static ssize_t rapl_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +-} +- +-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); +- +-static struct attribute *rapl_pmu_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group rapl_pmu_attr_group = { +- .attrs = rapl_pmu_attrs, +-}; +- + RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); + RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +@@ -438,7 +417,6 @@ static struct attribute_group rapl_pmu_f + }; + + static const struct attribute_group *rapl_attr_groups[] = { +- &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +@@ -541,49 +519,6 @@ static struct perf_msr amd_rapl_msrs[] = + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_cpu_offline(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- /* Check if exiting cpu is used for collecting rapl events */ +- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) +- return 0; +- +- pmu->cpu = -1; +- /* Find a new cpu to collect rapl events */ +- target = cpumask_any_but(topology_die_cpumask(cpu), cpu); +- +- /* Migrate rapl events to the new target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &rapl_cpu_mask); +- pmu->cpu = target; +- perf_pmu_migrate_context(pmu->pmu, cpu, target); +- } +- return 0; +-} +- +-static int rapl_cpu_online(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- if (!pmu) +- return -ENOMEM; +- +- /* +- * Check if there is an online cpu in the package 
which collects rapl +- * events already. +- */ +- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); +- if (target < nr_cpu_ids) +- return 0; +- +- cpumask_set_cpu(cpu, &rapl_cpu_mask); +- pmu->cpu = cpu; +- return 0; +-} +- + static int rapl_check_hw_unit(struct rapl_model *rm) + { + u64 msr_rapl_power_unit_bits; +@@ -707,6 +642,7 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; ++ rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + + init_rapl_pmu(); +@@ -857,24 +793,13 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- /* +- * Install callbacks. Core will call them for each online cpu. +- */ +- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, +- "perf/x86/rapl:online", +- rapl_cpu_online, rapl_cpu_offline); +- if (ret) +- goto out; +- + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + if (ret) +- goto out1; ++ goto out; + + rapl_advertise(); + return 0; + +-out1: +- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); + out: + pr_warn("Initialization failed (%d), disabled\n", ret); + cleanup_rapl_pmus(); +@@ -884,7 +809,6 @@ module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); + } +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -207,7 +207,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, +- CPUHP_AP_PERF_X86_RAPL_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, diff --git a/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch b/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch new file mode 100644 index 0000000..e535abd --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch @@ -0,0 +1,101 @@ +From f1525664ff9da3241b3556594dc0b67506ae1ddd Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Tue, 10 Sep 2024 14:25:05 +0530 +Subject: perf/x86/rapl: Fix the energy-pkg event for AMD CPUs + +After commit ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"), +on AMD processors that support extended CPUID leaf 0x80000026, the +topology_die_cpumask() and topology_logical_die_id() macros, no longer +return the package cpumask and package id, instead they return the CCD +(Core Complex Die) mask and id respectively. This leads to the energy-pkg +event scope to be modified to CCD instead of package. + +So, change the PMU scope for AMD and Hygon back to package. + +On a 12 CCD 1 Package AMD Zen4 Genoa machine: + +Before: +$ cat /sys/devices/power/cpumask +0,8,16,24,32,40,48,56,64,72,80,88. + +The expected cpumask here is supposed to be just "0", as it is a package +scope event, only one CPU will be collecting the event for all the CPUs in +the package. 
+ +After: +$ cat /sys/devices/power/cpumask +0 + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 35 ++++++++++++++++++++++++++++++++--- + 1 file changed, 32 insertions(+), 3 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -139,9 +139,32 @@ static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; + ++/* ++ * RAPL Package energy counter scope: ++ * 1. AMD/HYGON platforms have a per-PKG package energy counter ++ * 2. For Intel platforms ++ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope ++ * 2.2. Other Intel platforms are single die systems so the scope can be ++ * considered as either pkg-scope or die-scope, and we are considering ++ * them as die-scope. ++ */ ++#define rapl_pmu_is_pkg_scope() \ ++ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ ++ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) ++ ++/* ++ * Helper function to get the correct topology id according to the ++ * RAPL PMU scope. ++ */ ++static inline unsigned int get_rapl_pmu_idx(int cpu) ++{ ++ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : ++ topology_logical_die_id(cpu); ++} ++ + static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) + { +- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu); ++ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); + + /* + * The unsigned check also catches the '-1' return value for non +@@ -617,7 +640,7 @@ static void __init init_rapl_pmu(void) + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + +- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; ++ rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu; + } + + cpus_read_unlock(); +@@ -626,6 +649,12 @@ static void __init init_rapl_pmu(void) + static int __init init_rapl_pmus(void) + { + int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ ++ if (rapl_pmu_is_pkg_scope()) { ++ nr_rapl_pmu = topology_max_packages(); ++ rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ } + + rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) +@@ -641,8 +670,8 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; ++ rapl_pmus->pmu.scope = rapl_pmu_scope; + rapl_pmus->pmu.module = THIS_MODULE; +- rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + + init_rapl_pmu(); diff --git a/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch b/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch new file mode 100644 index 0000000..44994f6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch @@ -0,0 +1,77 @@ +From 9439067951f4d857272836b35812af26650d9c16 Mon Sep 17 00:00:00 2001 +From: K Prateek Nayak +Date: Fri, 13 Sep 2024 15:21:41 +0000 +Subject: x86/topology: Introduce topology_logical_core_id() + +On x86, topology_core_id() returns a unique core ID within the PKG +domain. Looking at match_smt() suggests that a core ID just needs to be +unique within a LLC domain. For use cases such as the per-core RAPL PMU, +there exists a need for a unique core ID across the entire system with +multiple PKG domains. Introduce topology_logical_core_id() to derive a +unique core ID across the system. 
+ +Signed-off-by: K Prateek Nayak +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Zhang Rui +Tested-by: K Prateek Nayak +--- + Documentation/arch/x86/topology.rst | 4 ++++ + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + 5 files changed, 8 insertions(+) + +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in t + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_cluster + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_fil + seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_sca + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ diff --git a/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch b/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch new file mode 100644 index 0000000..75276ef --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch @@ -0,0 +1,87 @@ +From b8e1231d5f78314de8f9066baba7b1fdd5e59218 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:21:42 +0000 +Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function + +Preparation for the addition of per-core RAPL energy counter support for +AMD CPUs. Post which, one cpu might be mapped to more than one rapl_pmu +(package/die one or per-core one), also makes sense to use the +get_rapl_pmu_idx macro which is anyway used to index into the +rapl_pmus->pmus[] array. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 29 +++++++++++++---------------- + 1 file changed, 13 insertions(+), 16 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -162,17 +162,6 @@ static inline unsigned int get_rapl_pmu_ + topology_logical_die_id(cpu); + } + +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +-{ +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- +- /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. +- */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; +-} +- + static inline u64 rapl_read_counter(struct perf_event *event) + { + u64 raw; +@@ -348,7 +337,7 @@ static void rapl_pmu_event_del(struct pe + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; ++ int bit, rapl_pmu_idx, ret = 0; + struct rapl_pmu *pmu; + + /* only look at RAPL events */ +@@ -376,8 +365,12 @@ static int rapl_pmu_event_init(struct pe + if (event->attr.sample_period) /* no sampling */ + return -EINVAL; + ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ return -EINVAL; ++ + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); ++ pmu = rapl_pmus->pmus[rapl_pmu_idx]; + if (!pmu) + return -EINVAL; + event->pmu_private = pmu; +@@ -623,12 +616,16 @@ static const struct attribute_group *rap + static void __init init_rapl_pmu(void) + { + struct rapl_pmu *pmu; +- int cpu; ++ int cpu, rapl_pmu_idx; + + cpus_read_lock(); + + for_each_cpu(cpu, cpu_online_mask) { +- pmu = cpu_to_rapl_pmu(cpu); ++ rapl_pmu_idx = get_rapl_pmu_idx(cpu); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ continue; ++ ++ pmu = rapl_pmus->pmus[rapl_pmu_idx]; + if (pmu) + continue; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +@@ -640,7 +637,7 @@ static void __init init_rapl_pmu(void) + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + +- rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu; ++ rapl_pmus->pmus[rapl_pmu_idx] = pmu; + } + + cpus_read_unlock(); diff --git a/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch b/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch new file mode 100644 index 0000000..0846423 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch @@ -0,0 +1,240 @@ +From 07ec9f38cac6eb6e5b0b062ef99e9458ba567de8 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:21:43 +0000 +Subject: perf/x86/rapl: Rename rapl_pmu variables + +Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to +avoid any confusion between the variables of two different +structs pmu and rapl_pmu. As rapl_pmu also contains a pointer to +struct pmu, which leads to situations in code like pmu->pmu, +which is needlessly confusing. Above scenario is replaced with +much more readable rapl_pmu->pmu with this change. + +Also rename "pmus" member in rapl_pmus struct, for same reason. + +No functional change. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 93 +++++++++++++++++++++--------------------- + 1 file changed, 47 insertions(+), 46 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -116,7 +116,7 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -223,34 +223,34 @@ static void rapl_start_hrtimer(struct ra + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -258,39 +258,39 @@ static void __rapl_pmu_event_start(struc + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -308,23 +308,23 @@ static void rapl_pmu_event_stop(struct p + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -338,7 +338,7 @@ static int rapl_pmu_event_init(struct pe + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, rapl_pmu_idx, ret = 0; +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmus->pmu.type) +@@ -370,10 +370,11 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + /* must be done before validate_group */ +- pmu = rapl_pmus->pmus[rapl_pmu_idx]; +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->pmu_private = pmu; ++ ++ event->pmu_private = rapl_pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; +@@ -600,7 +601,7 @@ static void cleanup_rapl_pmus(void) + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -615,7 +616,7 @@ static const struct attribute_group *rap + + static void __init init_rapl_pmu(void) + { +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + int cpu, rapl_pmu_idx; + + cpus_read_lock(); +@@ -625,19 +626,19 @@ static void __init init_rapl_pmu(void) + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + continue; + +- pmu = rapl_pmus->pmus[rapl_pmu_idx]; +- if (pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (rapl_pmu) + continue; +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) ++ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu)); ++ if (!rapl_pmu) + continue; +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); + +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; ++ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu; + } + + cpus_read_unlock(); +@@ -653,7 +654,7 @@ static int __init init_rapl_pmus(void) + rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + } + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + diff --git a/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch b/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch new file mode 100644 index 0000000..9bc67dc --- /dev/null 
+++ b/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch @@ -0,0 +1,75 @@ +From 68614752b9fd6b6bae6f9ab7b02fc28350c5a541 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:56 +0000 +Subject: perf/x86/rapl: Make rapl_model struct global + +Preparation for per-core energy counter support addition for AMD CPUs. + +As there will always be just one rapl_model variable on a system, make it +global, to make it easier to access it from any function. + +No functional change. + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -138,6 +138,7 @@ static struct rapl_pmus *rapl_pmus; + static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* + * RAPL Package energy counter scope: +@@ -536,18 +537,18 @@ static struct perf_msr amd_rapl_msrs[] = + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; + int ret; + + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = (struct rapl_model *) id->driver_data; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rm->rapl_msrs; ++ rapl_msrs = rapl_model->rapl_msrs; + + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); ++ false, (void *) &rapl_model->events); + +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + diff --git a/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch b/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch new file mode 100644 index 0000000..9fb5743 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch @@ -0,0 +1,112 @@ +From b10b887510ccb0b6bc7294888982b862703c9c32 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:57 +0000 +Subject: perf/x86/rapl: Add arguments to the cleanup and init functions + +Prep for per-core RAPL PMU addition. + +No functional change. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 32 +++++++++++++++++++------------- + 1 file changed, 19 insertions(+), 13 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -597,7 +597,7 @@ static void __init rapl_advertise(void) + } + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + +@@ -615,7 +615,7 @@ static const struct attribute_group *rap + NULL, + }; + +-static void __init init_rapl_pmu(void) ++static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { + struct rapl_pmu *rapl_pmu; + int cpu, rapl_pmu_idx; +@@ -645,20 +645,22 @@ static void __init init_rapl_pmu(void) + cpus_read_unlock(); + } + +-static int __init init_rapl_pmus(void) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) + { +- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); +- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ int nr_rapl_pmu; ++ struct rapl_pmus *rapl_pmus; + +- if (rapl_pmu_is_pkg_scope()) { +- nr_rapl_pmu = topology_max_packages(); +- rapl_pmu_scope = PERF_PMU_SCOPE_PKG; +- } ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG) ++ nr_rapl_pmu = topology_max_packages(); ++ else ++ nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); + + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -673,7 +675,7 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + +- init_rapl_pmu(); ++ init_rapl_pmu(rapl_pmus); + + return 0; + } +@@ -799,8 +801,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; ++ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pmu_is_pkg_scope()) ++ rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; +@@ -816,7 +822,7 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); + if (ret) + return ret; + +@@ -829,7 +835,7 @@ static int __init rapl_pmu_init(void) + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus); + return ret; + } + module_init(rapl_pmu_init); +@@ -837,6 +843,6 @@ module_init(rapl_pmu_init); + static void __exit intel_rapl_exit(void) + { + perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus); + } + module_exit(intel_rapl_exit); diff --git a/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch b/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch new file mode 100644 index 0000000..198598b --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch @@ -0,0 +1,358 @@ +From b5c83c40540298a39f8314034b705f1236b17a9f Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:58 +0000 +Subject: perf/x86/rapl: Modify the generic variable names to *_pkg* + +Prep for addition of power_per_core PMU to handle core scope energy +consumption for AMD CPUs. 
+ +Replace the generic names with *_pkg*, to differentiate between the +scopes of the two different PMUs and their variables. + +No functional change. + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 118 ++++++++++++++++++++--------------------- + 1 file changed, 59 insertions(+), 59 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -70,18 +70,18 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -126,16 +126,16 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ unsigned long pkg_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; + static struct rapl_model *rapl_model; +@@ -149,7 +149,7 @@ static struct rapl_model *rapl_model; + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. + */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -159,7 +159,7 @@ static struct rapl_model *rapl_model; + */ + static inline unsigned int get_rapl_pmu_idx(int cpu) + { +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : ++ return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : + topology_logical_die_id(cpu); + } + +@@ -172,7 +172,7 @@ static inline u64 rapl_read_counter(stru + + static inline u64 rapl_scale(u64 v, int cfg) + { +- if (cfg > NR_RAPL_DOMAINS) { ++ if (cfg > NR_RAPL_PKG_DOMAINS) { + pr_warn("Invalid domain %d, failed to scale data\n", cfg); + return v; + } +@@ -182,7 +182,7 @@ static inline u64 rapl_scale(u64 v, int + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - rapl_pkg_hw_unit[cfg - 1]); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe + struct rapl_pmu *rapl_pmu; + + /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) + return -ENOENT; + + /* check only supported bits are set */ +@@ -352,14 +352,14 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); + bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pkg_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ +@@ -367,11 +367,11 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); +- if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) + return -EINVAL; + + /* must be done before validate_group */ +- rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) + return -EINVAL; + +@@ -525,11 +525,11 @@ static struct perf_msr intel_rapl_spr_ms + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -545,8 +545,8 @@ static int rapl_check_hw_unit(void) + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + switch (rapl_model->unit_quirk) { + /* +@@ -556,11 +556,11 @@ static int rapl_check_hw_unit(void) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. 
*/ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; +@@ -575,9 +575,9 @@ static int rapl_check_hw_unit(void) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -587,12 +587,12 @@ static void __init rapl_advertise(void) + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pkg_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } + } +@@ -681,71 +681,71 @@ static int __init init_rapl_pmus(struct + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, + }; + + static const 
struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -801,11 +801,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + +- if (rapl_pmu_is_pkg_scope()) +- rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; + + id = x86_match_cpu(rapl_model_match); + if (!id) +@@ -813,20 +813,20 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rapl_model->rapl_msrs; ++ rapl_msrs = rapl_model->rapl_pkg_msrs; + +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rapl_model->events); ++ rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, ++ false, (void *) &rapl_model->pkg_events); + + ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); + if (ret) + return ret; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + +@@ -835,14 +835,14 @@ static int __init rapl_pmu_init(void) + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(rapl_pmus); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(rapl_pmus); ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); diff --git a/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch b/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch new file mode 100644 index 0000000..bce1965 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch @@ -0,0 +1,47 @@ +From dbc0343069c8f86fad0d8d9075f70f79114ef10a Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:59 +0000 +Subject: perf/x86/rapl: Remove the global variable rapl_msrs + +After making the rapl_model struct global, the rapl_msrs global +variable isn't needed, so remove it. + +Also it will be cleaner when new per-core scope PMU is added. As we will +need to maintain two rapl_msrs array(one for per-core scope and one for +package scope PMU), inside the rapl_model struct. 
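For orientation, a minimal standalone sketch of the data layout this rapl_msrs cleanup converges on. The stub perf_msr and rapl_unit_quirk definitions are simplified stand-ins (not the kernel's real ones), and the per-core fields only exist once the later per-core patch in this set is applied:

    /* Simplified stand-ins for the real definitions in arch/x86/events. */
    struct perf_msr { unsigned int msr; };
    enum rapl_unit_quirk { RAPL_UNIT_QUIRK_NONE };

    /* Shape of rapl_model as the hunks below leave it: the MSR tables live
     * in the model, so no file-scope rapl_msrs pointer is needed. */
    struct rapl_model {
            struct perf_msr *rapl_pkg_msrs;   /* package/die-scope events */
            struct perf_msr *rapl_core_msrs;  /* per-core events, added later in this set */
            unsigned long pkg_events;
            unsigned long core_events;
            unsigned int msr_power_unit;
            enum rapl_unit_quirk unit_quirk;
    };

    static struct rapl_model *rapl_model;

    /* Callers now index the model directly, e.g. for the event base: */
    static unsigned int rapl_event_base(int bit)
    {
            return rapl_model->rapl_pkg_msrs[bit].msr;
    }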
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -137,7 +137,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_ + static struct rapl_pmus *rapl_pmus_pkg; + static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; + static struct rapl_model *rapl_model; + + /* +@@ -376,7 +375,7 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + event->pmu_private = rapl_pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -813,9 +812,7 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rapl_model->rapl_pkg_msrs; +- +- rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, ++ rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, + false, (void *) &rapl_model->pkg_events); + + ret = rapl_check_hw_unit(); diff --git a/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch b/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch new file mode 100644 index 0000000..606ef19 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch @@ -0,0 +1,79 @@ +From d6a5a28382558b896767a78db795d421015831a7 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:48:00 +0000 +Subject: perf/x86/rapl: Move the cntr_mask to rapl_pmus struct + +Preparation for the addition of per-core RAPL energy counter for AMD +CPUs. + +Moving cntr_mask to rapl_pmus struct instead of adding a new global +cntr_mask for the per-core RAPL energy counter, will ensure that the +"per_core_cntr_mask" is only created if needed (i.e. in case of AMD +CPUs). 
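A compressed view of where the counter mask ends up (stand-in types, field order as in the hunk below); the point is that a mask only exists for a rapl_pmus instance that is actually allocated, so the AMD per-core mask never has to be a global:

    /* Stand-ins so the sketch is self-contained. */
    struct pmu { int type; int scope; };
    struct rapl_pmu;

    struct rapl_pmus {
            struct pmu pmu;
            unsigned int nr_rapl_pmu;
            unsigned int cntr_mask;          /* filled by perf_msr_probe() after allocation */
            struct rapl_pmu *rapl_pmu[];     /* one instance per package/die (or core, later) */
    };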
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -116,6 +116,7 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; ++ unsigned int cntr_mask; + struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + +@@ -135,7 +136,6 @@ struct rapl_model { + /* 1/2^hw_unit Joule */ + static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; + static struct rapl_pmus *rapl_pmus_pkg; +-static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; + static struct rapl_model *rapl_model; + +@@ -358,7 +358,7 @@ static int rapl_pmu_event_init(struct pe + bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_pkg_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ +@@ -586,10 +586,10 @@ static void __init rapl_advertise(void) + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); ++ hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { +- if (rapl_pkg_cntr_mask & (1 << i)) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } +@@ -812,9 +812,6 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, +- false, (void *) &rapl_model->pkg_events); +- + ret = rapl_check_hw_unit(); + if (ret) + return ret; +@@ -823,6 +820,10 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; diff --git a/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch b/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch new file mode 100644 index 0000000..55f7a3c --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch @@ -0,0 +1,439 @@ +From 3cb480ec2950f4c6351c602552fc4f9a8e524b89 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:48:01 +0000 +Subject: perf/x86/rapl: Add per-core energy counter support for AMD CPUs + +Add a new "power_per_core" PMU and "energy-per-core" event for +monitoring energy consumption by each core. The existing energy-cores +event aggregates the energy consumption at the package level. +This new event aligns with the AMD's per_core energy counters. + +Tested the package level and core level PMU counters with workloads +pinned to different CPUs. 
+ +Results with workload pinned to CPU 1 in core 1 on a AMD Zen4 Genoa +machine: + +$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1 + + Performance counter stats for 'system wide': + +S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/ +S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/ + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 178 +++++++++++++++++++++++++++++++++-------- + 1 file changed, 143 insertions(+), 35 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * per_core counter: consumption of a single physical core ++ * event: rapl_energy_per_core (power_per_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -81,6 +85,10 @@ enum perf_rapl_pkg_events { + NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + ++#define PERF_RAPL_PER_CORE 0 /* per-core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ + static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", +@@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "per-core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -128,14 +138,18 @@ enum rapl_unit_quirk { + + struct rapl_model { + struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; + unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ + static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; + static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; + static struct rapl_model *rapl_model; + +@@ -156,10 +170,14 @@ static struct rapl_model *rapl_model; + * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- return rapl_pkg_pmu_is_pkg_scope() ? 
topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); ++ if (scope == PERF_PMU_SCOPE_PKG) ++ return topology_logical_package_id(cpu); ++ else if (scope == PERF_PMU_SCOPE_DIE) ++ return topology_logical_die_id(cpu); ++ else ++ return topology_logical_core_id(cpu); + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -169,19 +187,20 @@ static inline u64 rapl_read_counter(stru + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_PKG_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ +- return v << (32 - rapl_pkg_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -208,7 +227,7 @@ static u64 rapl_event_update(struct perf + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -337,12 +356,13 @@ static void rapl_pmu_event_del(struct pe + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, rapl_pmu_idx, ret = 0; ++ int bit, rapl_pmus_scope, rapl_pmu_idx, ret = 0; + struct rapl_pmu *rapl_pmu; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus_pkg->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -351,31 +371,49 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) + return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; + +- cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); +- bit = cfg - 1; +- +- /* check event supported */ +- if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL per-core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ /* check event supported */ ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- rapl_pmu_idx = 
get_rapl_pmu_idx(event->cpu); +- if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) + return -EINVAL; + + event->pmu_private = rapl_pmu; +- event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -392,12 +430,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -407,6 +447,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -439,6 +480,12 @@ static const struct attribute_group *rap + NULL, + }; + ++static const struct attribute_group *rapl_per_core_attr_groups[] = { ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ + static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), +@@ -499,6 +546,18 @@ static struct attribute_group rapl_event + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_per_core[] = { ++ EVENT_PTR(rapl_per_core), ++ EVENT_PTR(rapl_per_core_unit), ++ EVENT_PTR(rapl_per_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_per_core_group = { ++ .name = "events", ++ .attrs = rapl_events_per_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -536,6 +595,11 @@ static struct perf_msr amd_rapl_pkg_msrs + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; ++ + static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; +@@ -547,6 +611,8 @@ static int rapl_check_hw_unit(void) + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) + rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ + switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be +@@ -565,7 +631,6 @@ static int rapl_check_hw_unit(void) + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter 
+@@ -584,9 +649,13 @@ static int rapl_check_hw_unit(void) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { + if (rapl_pmus_pkg->cntr_mask & (1 << i)) { +@@ -594,6 +663,10 @@ static void __init rapl_advertise(void) + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_PER_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + + static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) +@@ -614,6 +687,10 @@ static const struct attribute_group *rap + NULL, + }; + ++static const struct attribute_group *rapl_per_core_attr_update[] = { ++ &rapl_events_per_core_group, ++}; ++ + static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { + struct rapl_pmu *rapl_pmu; +@@ -622,10 +699,9 @@ static void __init init_rapl_pmu(struct + cpus_read_lock(); + + for_each_cpu(cpu, cpu_online_mask) { +- rapl_pmu_idx = get_rapl_pmu_idx(cpu); ++ rapl_pmu_idx = get_rapl_pmu_idx(cpu, rapl_pmus->pmu.scope); + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + continue; +- + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (rapl_pmu) + continue; +@@ -644,15 +720,19 @@ static void __init init_rapl_pmu(struct + cpus_read_unlock(); + } + +-static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu; + struct rapl_pmus *rapl_pmus; + + if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG) + nr_rapl_pmu = topology_max_packages(); +- else ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) + nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++ else ++ nr_rapl_pmu = topology_max_packages() * topology_num_cores_per_package(); + + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) +@@ -743,8 +823,10 @@ static struct rapl_model model_spr = { + + static struct rapl_model model_amd_hygon = { + .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_PER_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -816,7 +898,8 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +@@ -828,6 +911,27 @@ static int __init rapl_pmu_init(void) + if (ret) + goto out; + ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_per_core_attr_groups, ++ rapl_per_core_attr_update); ++ if (ret) { ++ pr_warn("Per-core PMU initialization failed (%d)\n", ret); ++ goto per_core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = 
perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1); ++ if (ret) { ++ pr_warn("Per-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } ++ ++per_core_init_failed: + rapl_advertise(); + return 0; + +@@ -840,6 +944,10 @@ module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } + perf_pmu_unregister(&rapl_pmus_pkg->pmu); + cleanup_rapl_pmus(rapl_pmus_pkg); + } diff --git a/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch new file mode 100644 index 0000000..01dacc3 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch @@ -0,0 +1,180 @@ +From d31e903a364802c068ff23bdd448cc70eda71a7c Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:38 +0100 +Subject: cpuidle: menu: Remove iowait influence + +Remove CPU iowaiters influence on idle state selection. +Remove the menu notion of performance multiplier which increased with +the number of tasks that went to iowait sleep on this CPU and haven't +woken up yet. + +Relying on iowait for cpuidle is problematic for a few reasons: +1. There is no guarantee that an iowaiting task will wake up on the +same CPU. +2. The task being in iowait says nothing about the idle duration, we +could be selecting shallower states for a long time. +3. The task being in iowait doesn't always imply a performance hit +with increased latency. +4. If there is such a performance hit, the number of iowaiting tasks +doesn't directly correlate. +5. The definition of iowait altogether is vague at best, it is +sprinkled across kernel code. + +Signed-off-by: Christian Loehle +--- + drivers/cpuidle/governors/menu.c | 76 ++++---------------------------- + 1 file changed, 9 insertions(+), 67 deletions(-) + +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -19,7 +19,7 @@ + + #include "gov.h" + +-#define BUCKETS 12 ++#define BUCKETS 6 + #define INTERVAL_SHIFT 3 + #define INTERVALS (1UL << INTERVAL_SHIFT) + #define RESOLUTION 1024 +@@ -29,12 +29,11 @@ + /* + * Concepts and ideas behind the menu governor + * +- * For the menu governor, there are 3 decision factors for picking a C ++ * For the menu governor, there are 2 decision factors for picking a C + * state: + * 1) Energy break even point +- * 2) Performance impact +- * 3) Latency tolerance (from pmqos infrastructure) +- * These three factors are treated independently. ++ * 2) Latency tolerance (from pmqos infrastructure) ++ * These two factors are treated independently. + * + * Energy break even point + * ----------------------- +@@ -75,30 +74,6 @@ + * intervals and if the stand deviation of these 8 intervals is below a + * threshold value, we use the average of these intervals as prediction. + * +- * Limiting Performance Impact +- * --------------------------- +- * C states, especially those with large exit latencies, can have a real +- * noticeable impact on workloads, which is not acceptable for most sysadmins, +- * and in addition, less performance has a power price of its own. 
+- * +- * As a general rule of thumb, menu assumes that the following heuristic +- * holds: +- * The busier the system, the less impact of C states is acceptable +- * +- * This rule-of-thumb is implemented using a performance-multiplier: +- * If the exit latency times the performance multiplier is longer than +- * the predicted duration, the C state is not considered a candidate +- * for selection due to a too high performance impact. So the higher +- * this multiplier is, the longer we need to be idle to pick a deep C +- * state, and thus the less likely a busy CPU will hit such a deep +- * C state. +- * +- * Currently there is only one value determining the factor: +- * 10 points are added for each process that is waiting for IO on this CPU. +- * (This value was experimentally determined.) +- * Utilization is no longer a factor as it was shown that it never contributed +- * significantly to the performance multiplier in the first place. +- * + */ + + struct menu_device { +@@ -112,19 +87,10 @@ struct menu_device { + int interval_ptr; + }; + +-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) ++static inline int which_bucket(u64 duration_ns) + { + int bucket = 0; + +- /* +- * We keep two groups of stats; one with no +- * IO pending, one without. +- * This allows us to calculate +- * E(duration)|iowait +- */ +- if (nr_iowaiters) +- bucket = BUCKETS/2; +- + if (duration_ns < 10ULL * NSEC_PER_USEC) + return bucket; + if (duration_ns < 100ULL * NSEC_PER_USEC) +@@ -138,19 +104,6 @@ static inline int which_bucket(u64 durat + return bucket + 5; + } + +-/* +- * Return a multiplier for the exit latency that is intended +- * to take performance requirements into account. +- * The more performance critical we estimate the system +- * to be, the higher this multiplier, and thus the higher +- * the barrier to go to an expensive C state. +- */ +-static inline int performance_multiplier(unsigned int nr_iowaiters) +-{ +- /* for IO wait tasks (per cpu!) we add 10x each */ +- return 1 + 10 * nr_iowaiters; +-} +- + static DEFINE_PER_CPU(struct menu_device, menu_devices); + + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); +@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_dr + struct menu_device *data = this_cpu_ptr(&menu_devices); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 predicted_ns; +- u64 interactivity_req; +- unsigned int nr_iowaiters; + ktime_t delta, delta_tick; + int i, idx; + +@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_dr + data->needs_update = 0; + } + +- nr_iowaiters = nr_iowait_cpu(dev->cpu); +- + /* Find the shortest expected idle interval. */ + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; + if (predicted_ns > RESIDENCY_THRESHOLD_NS) { +@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_dr + } + + data->next_timer_ns = delta; +- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); ++ data->bucket = which_bucket(data->next_timer_ns); + + /* Round up the result for half microseconds. 
*/ + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + +@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_dr + */ + data->next_timer_ns = KTIME_MAX; + delta_tick = TICK_NSEC / 2; +- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); ++ data->bucket = which_bucket(KTIME_MAX); + } + + if (unlikely(drv->state_count <= 1 || latency_req == 0) || +@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_dr + */ + if (predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; +- } else { +- /* +- * Use the performance multiplier and the user-configurable +- * latency_req to determine the maximum exit latency. +- */ +- interactivity_req = div64_u64(predicted_ns, +- performance_multiplier(nr_iowaiters)); +- if (latency_req > interactivity_req) +- latency_req = interactivity_req; ++ } else if (latency_req > predicted_ns) { ++ latency_req = predicted_ns; + } + + /* diff --git a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch new file mode 100644 index 0000000..d32e22a --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch @@ -0,0 +1,58 @@ +From 3f840a42780323a4437dd1a417488d141c33af15 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:39 +0100 +Subject: cpuidle: Prefer teo over menu governor + +Since menu no longer has the interactivity boost teo works better +overall, so make it the default. + +Signed-off-by: Christian Loehle +--- + drivers/cpuidle/Kconfig | 5 +---- + drivers/cpuidle/governors/menu.c | 2 +- + drivers/cpuidle/governors/teo.c | 2 +- + 3 files changed, 3 insertions(+), 6 deletions(-) + +--- a/drivers/cpuidle/Kconfig ++++ b/drivers/cpuidle/Kconfig +@@ -5,7 +5,7 @@ config CPU_IDLE + bool "CPU idle PM support" + default y if ACPI || PPC_PSERIES + select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) +- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO ++ select CPU_IDLE_GOV_TEO if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_MENU + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform +@@ -30,9 +30,6 @@ config CPU_IDLE_GOV_TEO + This governor implements a simplified idle state selection method + focused on timer events and does not do any interactivity boosting. + +- Some workloads benefit from using it and it generally should be safe +- to use. Say Y here if you are not happy with the alternatives. 
+- + config CPU_IDLE_GOV_HALTPOLL + bool "Haltpoll governor (for virtualized systems)" + depends on KVM_GUEST +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -508,7 +508,7 @@ static int menu_enable_device(struct cpu + + static struct cpuidle_governor menu_governor = { + .name = "menu", +- .rating = 20, ++ .rating = 19, + .enable = menu_enable_device, + .select = menu_select, + .reflect = menu_reflect, +--- a/drivers/cpuidle/governors/teo.c ++++ b/drivers/cpuidle/governors/teo.c +@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui + + static struct cpuidle_governor teo_governor = { + .name = "teo", +- .rating = 19, ++ .rating = 20, + .enable = teo_enable_device, + .select = teo_select, + .reflect = teo_reflect, diff --git a/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch b/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch new file mode 100644 index 0000000..cc7e290 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch @@ -0,0 +1,39 @@ +From ca8c9368b6f28ef625716b03aa930acfb8afe158 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:40 +0100 +Subject: TEST: cpufreq/schedutil: Linear iowait boost step + +In preparation for capping iowait boost make the steps linear as +opposed to doubling. + +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -267,7 +267,8 @@ static void sugov_iowait_boost(struct su + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost = +- min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); ++ min_t(unsigned int, ++ sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE); + return; + } + +@@ -308,11 +309,9 @@ static unsigned long sugov_iowait_apply( + /* + * No boost pending; reduce the boost value. + */ +- sg_cpu->iowait_boost >>= 1; +- if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { +- sg_cpu->iowait_boost = 0; ++ sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN; ++ if (!sg_cpu->iowait_boost) + return 0; +- } + } + + sg_cpu->iowait_boost_pending = false; diff --git a/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch b/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch new file mode 100644 index 0000000..a655dad --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch @@ -0,0 +1,106 @@ +From 33f05bd16a4ac2f6f36c9eb88016e2375dcb597c Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:41 +0100 +Subject: TEST: cpufreq/schedutil: iowait boost cap sysfs + +Add a knob to cap applied iowait_boost per sysfs. +This is to test for potential regressions. 
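To make the knob concrete, here is a small userspace mock of the ramp-and-clamp behaviour these two TEST patches give schedutil. The constants mirror the ones visible in the hunks (SCHED_CAPACITY_SCALE is 1024 in current kernels, IOWAIT_BOOST_MIN an eighth of that); the loop is only an approximation of sugov_iowait_boost()/sugov_iowait_apply() under back-to-back in_iowait wakeups, and the cap value of 256 is an arbitrary example:

    #include <stdio.h>

    /* Constants mirroring the kernel's (SCHED_CAPACITY_SCALE is 1024). */
    #define SCHED_CAPACITY_SCALE 1024u
    #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)

    int main(void)
    {
            unsigned int cap = 256;   /* example value written to iowait_boost_cap */
            unsigned int boost = 0;

            /* Pretend one boosted wakeup arrives per tick, eight ticks in a row. */
            for (int tick = 1; tick <= 8; tick++) {
                    /* linear step from the previous TEST patch */
                    boost = boost ? boost + IOWAIT_BOOST_MIN : IOWAIT_BOOST_MIN;
                    if (boost > SCHED_CAPACITY_SCALE)
                            boost = SCHED_CAPACITY_SCALE;
                    /* clamp added by this patch */
                    if (boost > cap)
                            boost = cap;
                    printf("tick %d: iowait_boost=%u\n", tick, boost);
            }
            return 0;
    }

With the default cap of SCHED_CAPACITY_SCALE the clamp never fires, so behaviour is unchanged until the sysfs attribute is lowered.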
+ +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -11,6 +11,7 @@ + struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; ++ unsigned int iowait_boost_cap; + }; + + struct sugov_policy { +@@ -35,6 +36,8 @@ struct sugov_policy { + + bool limits_changed; + bool need_freq_update; ++ ++ unsigned int iowait_boost_cap; + }; + + struct sugov_cpu { +@@ -316,6 +319,9 @@ static unsigned long sugov_iowait_apply( + + sg_cpu->iowait_boost_pending = false; + ++ if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap) ++ sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap; ++ + /* + * sg_cpu->util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. +@@ -554,6 +560,14 @@ static ssize_t rate_limit_us_show(struct + return sprintf(buf, "%u\n", tunables->rate_limit_us); + } + ++ ++static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf) ++{ ++ struct sugov_tunables *tunables = to_sugov_tunables(attr_set); ++ ++ return sprintf(buf, "%u\n", tunables->iowait_boost_cap); ++} ++ + static ssize_t + rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) + { +@@ -572,10 +586,30 @@ rate_limit_us_store(struct gov_attr_set + return count; + } + ++static ssize_t ++iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count) ++{ ++ struct sugov_tunables *tunables = to_sugov_tunables(attr_set); ++ struct sugov_policy *sg_policy; ++ unsigned int iowait_boost_cap; ++ ++ if (kstrtouint(buf, 10, &iowait_boost_cap)) ++ return -EINVAL; ++ ++ tunables->iowait_boost_cap = iowait_boost_cap; ++ ++ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) ++ sg_policy->iowait_boost_cap = iowait_boost_cap; ++ ++ return count; ++} ++ + static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); ++static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap); + + static struct attribute *sugov_attrs[] = { + &rate_limit_us.attr, ++ &iowait_boost_cap.attr, + NULL + }; + ATTRIBUTE_GROUPS(sugov); +@@ -765,6 +799,8 @@ static int sugov_init(struct cpufreq_pol + + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); + ++ tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE; ++ + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + +@@ -834,6 +870,8 @@ static int sugov_start(struct cpufreq_po + sg_policy->limits_changed = false; + sg_policy->cached_raw_freq = 0; + ++ sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE; ++ + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + + if (policy_is_shared(policy)) diff --git a/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch b/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch new file mode 100644 index 0000000..2c1db29 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch @@ -0,0 +1,325 @@ +From 33eb6c08d7c615fad308001921c7b1148cbccfde Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:42 +0100 +Subject: cpufreq/schedutil: Remove iowait boost + +iowait boost in schedutil was introduced by +commit ("21ca6d2c52f8 cpufreq: schedutil: Add iowait boosting"). 
+with it more or less following intel_pstate's approach to increase +frequency after an iowait wakeup. +Behaviour that is piggy-backed onto iowait boost is problematic +due to a lot of reasons, so remove it. + +For schedutil specifically these are some of the reasons: +1. Boosting is applied even in scenarios where it doesn't improve +throughput. +2. The boost is not accounted for in EAS: a) feec() will only consider + the actual task utilization for task placement, but another CPU might + be more energy-efficient at that capacity than the boosted one.) + b) When placing a non-IO task while a CPU is boosted compute_energy() + assumes a lower OPP than what is actually applied. This leads to + wrong EAS decisions. +3. Actual IO heavy workloads are hardly distinguished from infrequent +in_iowait wakeups. +4. The boost isn't accounted for in task placement. +5. The boost isn't associated with a task, it therefore lingers on the +rq even after the responsible task has migrated / stopped. +6. The boost isn't associated with a task, it therefore needs to ramp +up again when migrated. +7. Since schedutil doesn't know which task is getting woken up, +multiple unrelated in_iowait tasks lead to boosting. +8. Boosting is hard to control with UCLAMP_MAX (which is only active +when the task is on the rq, which for boosted tasks is usually not +the case for most of the time). + +One benefit of schedutil specifically is the reliance on the +scheduler's utilization signals, which have evolved a lot since it's +original introduction. Some cases that benefitted from iowait boosting +in the past can now be covered by e.g. util_est. + +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 181 +------------------------------ + 1 file changed, 3 insertions(+), 178 deletions(-) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -6,12 +6,9 @@ + * Author: Rafael J. Wysocki + */ + +-#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) +- + struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +- unsigned int iowait_boost_cap; + }; + + struct sugov_policy { +@@ -36,8 +33,6 @@ struct sugov_policy { + + bool limits_changed; + bool need_freq_update; +- +- unsigned int iowait_boost_cap; + }; + + struct sugov_cpu { +@@ -45,10 +40,6 @@ struct sugov_cpu { + struct sugov_policy *sg_policy; + unsigned int cpu; + +- bool iowait_boost_pending; +- unsigned int iowait_boost; +- u64 last_update; +- + unsigned long util; + unsigned long bw_min; + +@@ -198,137 +189,15 @@ unsigned long sugov_effective_cpu_perf(i + return max(min, max); + } + +-static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) ++static void sugov_get_util(struct sugov_cpu *sg_cpu) + { + unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); +- util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); + } + +-/** +- * sugov_iowait_reset() - Reset the IO boost status of a CPU. +- * @sg_cpu: the sugov data for the CPU to boost +- * @time: the update time from the caller +- * @set_iowait_boost: true if an IO boost has been requested +- * +- * The IO wait boost of a task is disabled after a tick since the last update +- * of a CPU. If a new IO wait boost is requested after more then a tick, then +- * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy +- * efficiency by ignoring sporadic wakeups from IO. 
+- */ +-static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, +- bool set_iowait_boost) +-{ +- s64 delta_ns = time - sg_cpu->last_update; +- +- /* Reset boost only if a tick has elapsed since last request */ +- if (delta_ns <= TICK_NSEC) +- return false; +- +- sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; +- sg_cpu->iowait_boost_pending = set_iowait_boost; +- +- return true; +-} +- +-/** +- * sugov_iowait_boost() - Updates the IO boost status of a CPU. +- * @sg_cpu: the sugov data for the CPU to boost +- * @time: the update time from the caller +- * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait +- * +- * Each time a task wakes up after an IO operation, the CPU utilization can be +- * boosted to a certain utilization which doubles at each "frequent and +- * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization +- * of the maximum OPP. +- * +- * To keep doubling, an IO boost has to be requested at least once per tick, +- * otherwise we restart from the utilization of the minimum OPP. +- */ +-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, +- unsigned int flags) +-{ +- bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; +- +- /* Reset boost if the CPU appears to have been idle enough */ +- if (sg_cpu->iowait_boost && +- sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) +- return; +- +- /* Boost only tasks waking up after IO */ +- if (!set_iowait_boost) +- return; +- +- /* Ensure boost doubles only one time at each request */ +- if (sg_cpu->iowait_boost_pending) +- return; +- sg_cpu->iowait_boost_pending = true; +- +- /* Double the boost at each request */ +- if (sg_cpu->iowait_boost) { +- sg_cpu->iowait_boost = +- min_t(unsigned int, +- sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE); +- return; +- } +- +- /* First wakeup after IO: start with minimum boost */ +- sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; +-} +- +-/** +- * sugov_iowait_apply() - Apply the IO boost to a CPU. +- * @sg_cpu: the sugov data for the cpu to boost +- * @time: the update time from the caller +- * @max_cap: the max CPU capacity +- * +- * A CPU running a task which woken up after an IO operation can have its +- * utilization boosted to speed up the completion of those IO operations. +- * The IO boost value is increased each time a task wakes up from IO, in +- * sugov_iowait_apply(), and it's instead decreased by this function, +- * each time an increase has not been requested (!iowait_boost_pending). +- * +- * A CPU which also appears to have been idle for at least one tick has also +- * its IO boost utilization reset. +- * +- * This mechanism is designed to boost high frequently IO waiting tasks, while +- * being more conservative on tasks which does sporadic IO operations. +- */ +-static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +- unsigned long max_cap) +-{ +- /* No boost currently required */ +- if (!sg_cpu->iowait_boost) +- return 0; +- +- /* Reset boost if the CPU appears to have been idle enough */ +- if (sugov_iowait_reset(sg_cpu, time, false)) +- return 0; +- +- if (!sg_cpu->iowait_boost_pending) { +- /* +- * No boost pending; reduce the boost value. 
+- */ +- sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN; +- if (!sg_cpu->iowait_boost) +- return 0; +- } +- +- sg_cpu->iowait_boost_pending = false; +- +- if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap) +- sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap; +- +- /* +- * sg_cpu->util is already in capacity scale; convert iowait_boost +- * into the same scale so we can compare. +- */ +- return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; +-} +- + #ifdef CONFIG_NO_HZ_COMMON + static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) + { +@@ -356,18 +225,12 @@ static inline bool sugov_update_single_c + u64 time, unsigned long max_cap, + unsigned int flags) + { +- unsigned long boost; +- +- sugov_iowait_boost(sg_cpu, time, flags); +- sg_cpu->last_update = time; +- + ignore_dl_rate_limit(sg_cpu); + + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) + return false; + +- boost = sugov_iowait_apply(sg_cpu, time, max_cap); +- sugov_get_util(sg_cpu, boost); ++ sugov_get_util(sg_cpu); + + return true; + } +@@ -468,11 +331,8 @@ static unsigned int sugov_next_freq_shar + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); +- unsigned long boost; +- +- boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); +- sugov_get_util(j_sg_cpu, boost); + ++ sugov_get_util(j_sg_cpu); + util = max(j_sg_cpu->util, util); + } + +@@ -488,9 +348,6 @@ sugov_update_shared(struct update_util_d + + raw_spin_lock(&sg_policy->update_lock); + +- sugov_iowait_boost(sg_cpu, time, flags); +- sg_cpu->last_update = time; +- + ignore_dl_rate_limit(sg_cpu); + + if (sugov_should_update_freq(sg_policy, time)) { +@@ -560,14 +417,6 @@ static ssize_t rate_limit_us_show(struct + return sprintf(buf, "%u\n", tunables->rate_limit_us); + } + +- +-static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf) +-{ +- struct sugov_tunables *tunables = to_sugov_tunables(attr_set); +- +- return sprintf(buf, "%u\n", tunables->iowait_boost_cap); +-} +- + static ssize_t + rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) + { +@@ -586,30 +435,10 @@ rate_limit_us_store(struct gov_attr_set + return count; + } + +-static ssize_t +-iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count) +-{ +- struct sugov_tunables *tunables = to_sugov_tunables(attr_set); +- struct sugov_policy *sg_policy; +- unsigned int iowait_boost_cap; +- +- if (kstrtouint(buf, 10, &iowait_boost_cap)) +- return -EINVAL; +- +- tunables->iowait_boost_cap = iowait_boost_cap; +- +- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) +- sg_policy->iowait_boost_cap = iowait_boost_cap; +- +- return count; +-} +- + static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); +-static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap); + + static struct attribute *sugov_attrs[] = { + &rate_limit_us.attr, +- &iowait_boost_cap.attr, + NULL + }; + ATTRIBUTE_GROUPS(sugov); +@@ -799,8 +628,6 @@ static int sugov_init(struct cpufreq_pol + + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); + +- tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE; +- + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + +@@ -870,8 +697,6 @@ static int sugov_start(struct cpufreq_po + sg_policy->limits_changed = false; + sg_policy->cached_raw_freq = 0; + +- sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE; +- + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); 
+ + if (policy_is_shared(policy)) diff --git a/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch b/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch new file mode 100644 index 0000000..c255931 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch @@ -0,0 +1,113 @@ +From af7bbb59c2411e985a5d79173af5686337b4af9b Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:43 +0100 +Subject: cpufreq: intel_pstate: Remove iowait boost + +Analogous to schedutil, remove iowait boost for the same reasons. + +Signed-off-by: Christian Loehle +--- + drivers/cpufreq/intel_pstate.c | 50 ++-------------------------------- + 1 file changed, 3 insertions(+), 47 deletions(-) + +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -191,7 +191,6 @@ struct global_params { + * @policy: CPUFreq policy value + * @update_util: CPUFreq utility callback information + * @update_util_set: CPUFreq utility callback is set +- * @iowait_boost: iowait-related boost fraction + * @last_update: Time of the last update. + * @pstate: Stores P state limits for this CPU + * @vid: Stores VID limits for this CPU +@@ -245,7 +244,6 @@ struct cpudata { + struct acpi_processor_performance acpi_perf_data; + bool valid_pss_table; + #endif +- unsigned int iowait_boost; + s16 epp_powersave; + s16 epp_policy; + s16 epp_default; +@@ -2136,28 +2134,7 @@ static inline void intel_pstate_update_u + { + cpu->sample.time = time; + +- if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { +- bool do_io = false; +- +- cpu->sched_flags = 0; +- /* +- * Set iowait_boost flag and update time. Since IO WAIT flag +- * is set all the time, we can't just conclude that there is +- * some IO bound activity is scheduled on this CPU with just +- * one occurrence. If we receive at least two in two +- * consecutive ticks, then we treat as boost candidate. +- */ +- if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) +- do_io = true; +- +- cpu->last_io_update = time; +- +- if (do_io) +- intel_pstate_hwp_boost_up(cpu); +- +- } else { +- intel_pstate_hwp_boost_down(cpu); +- } ++ intel_pstate_hwp_boost_down(cpu); + } + + static inline void intel_pstate_update_util_hwp(struct update_util_data *data, +@@ -2240,9 +2217,6 @@ static inline int32_t get_target_pstate( + busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, + sample->tsc); + +- if (busy_frac < cpu->iowait_boost) +- busy_frac = cpu->iowait_boost; +- + sample->busy_scaled = busy_frac * 100; + + target = READ_ONCE(global.no_turbo) ? +@@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(s + sample->aperf, + sample->tsc, + get_avg_frequency(cpu), +- fp_toint(cpu->iowait_boost * 100)); ++ 0); + } + + static void intel_pstate_update_util(struct update_util_data *data, u64 time, +@@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(str + return; + + delta_ns = time - cpu->last_update; +- if (flags & SCHED_CPUFREQ_IOWAIT) { +- /* Start over if the CPU may have been idle. */ +- if (delta_ns > TICK_NSEC) { +- cpu->iowait_boost = ONE_EIGHTH_FP; +- } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { +- cpu->iowait_boost <<= 1; +- if (cpu->iowait_boost > int_tofp(1)) +- cpu->iowait_boost = int_tofp(1); +- } else { +- cpu->iowait_boost = ONE_EIGHTH_FP; +- } +- } else if (cpu->iowait_boost) { +- /* Clear iowait_boost if the CPU may have been idle. 
*/ +- if (delta_ns > TICK_NSEC) +- cpu->iowait_boost = 0; +- else +- cpu->iowait_boost >>= 1; +- } + cpu->last_update = time; + delta_ns = time - cpu->sample.time; + if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) +@@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct c + sample->aperf, + sample->tsc, + get_avg_frequency(cpu), +- fp_toint(cpu->iowait_boost * 100)); ++ 0); + } + + static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max, diff --git a/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch b/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch new file mode 100644 index 0000000..238f3b0 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch @@ -0,0 +1,42 @@ +From fd1e0723b0a7ad140d2bf7cd9154997d5ece2b37 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:44 +0100 +Subject: cpufreq: Remove SCHED_CPUFREQ_IOWAIT update + +Neither intel_pstate nor schedutil care for the flag anymore, so +remove the update and flag definition. + +Signed-off-by: Christian Loehle +--- + include/linux/sched/cpufreq.h | 2 -- + kernel/sched/fair.c | 8 -------- + 2 files changed, 10 deletions(-) + +--- a/include/linux/sched/cpufreq.h ++++ b/include/linux/sched/cpufreq.h +@@ -8,8 +8,6 @@ + * Interface between cpufreq drivers and the scheduler: + */ + +-#define SCHED_CPUFREQ_IOWAIT (1U << 0) +- + #ifdef CONFIG_CPU_FREQ + struct cpufreq_policy; + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6768,14 +6768,6 @@ enqueue_task_fair(struct rq *rq, struct + */ + util_est_enqueue(&rq->cfs, p); + +- /* +- * If in_iowait is set, the code below may not trigger any cpufreq +- * utilization updates, so do it here explicitly with the IOWAIT flag +- * passed. +- */ +- if (p->in_iowait) +- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); +- + for_each_sched_entity(se) { + if (se->on_rq) + break; diff --git a/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch b/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch new file mode 100644 index 0000000..6a2d4c4 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch @@ -0,0 +1,55 @@ +From 30cdb8d7d06f51bb86142c537ea05bd01c31bb40 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:45 +0100 +Subject: io_uring: Do not set iowait before sleeping + +Setting in_iowait was introduced in commit +8a796565cec3 ("io_uring: Use io_schedule* in cqring wait") +to tackle a perf regression that was caused by menu taking iowait into +account for synchronous IO and thus not selecting deeper states like in +the io_uring counterpart. +That behaviour is gone, so the workaround can be removed. 
+ +Signed-off-by: Christian Loehle +--- + io_uring/io_uring.c | 17 ----------------- + 1 file changed, 17 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2359,15 +2359,6 @@ int io_run_task_work_sig(struct io_ring_ + return 0; + } + +-static bool current_pending_io(void) +-{ +- struct io_uring_task *tctx = current->io_uring; +- +- if (!tctx) +- return false; +- return percpu_counter_read_positive(&tctx->inflight); +-} +- + /* when returns >0, the caller should retry */ + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +@@ -2385,19 +2376,11 @@ static inline int io_cqring_wait_schedul + if (unlikely(io_should_wake(iowq))) + return 0; + +- /* +- * Mark us as being in io_wait if we have pending requests, so cpufreq +- * can take into account that the task is waiting for IO - turns out +- * to be important for low QD IO. +- */ +- if (current_pending_io()) +- current->in_iowait = 1; + ret = 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) + ret = -ETIME; +- current->in_iowait = 0; + return ret; + } + diff --git a/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch new file mode 100644 index 0000000..7fd04a8 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch @@ -0,0 +1,181 @@ +From e6b67a8d14e86d63062e6f1f234c5afc235561d4 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes + +The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for +lengths >= 512, due to the overhead of saving and restoring FPU state. +Therefore, it is unnecessary for this code to be excessively "optimized" +for lengths < 200. Eliminate the excessive unrolling of this part of +the code and use a more straightforward qword-at-a-time loop. + +Note: the part of the code in question is not entirely redundant, as it +is still used to process any remainder mod 24, as well as any remaining +data when fewer than 200 bytes remain after least one 3072-byte chunk. + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++---------------- + 1 file changed, 33 insertions(+), 83 deletions(-) + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -56,20 +56,10 @@ + .quad .Lcrc_\i + .endm + +-.macro JNC_LESS_THAN j +- jnc .Lless_than_\j +-.endm +- +-# Define threshold where buffers are considered "small" and routed to more +-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +-# SMALL_SIZE can be no larger than 255. +- ++# Define threshold below which buffers are considered "small" and routed to ++# regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-.if (SMALL_SIZE > 255) +-.error "SMALL_ SIZE must be < 256" +-.endif +- + # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + + .text +@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl) + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + ++ mov %bufp, bufptmp # rdi = *buf ++ cmp $SMALL_SIZE, len ++ jb .Lsmall ++ + ################################################################ + ## 1) ALIGN: + ################################################################ +- +- mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of + # the address + je .Lproc_block # Skip if aligned + +- ## If len is less than 8 and we're unaligned, we need to jump +- ## to special code to avoid reading beyond the end of the buffer +- cmp $8, len +- jae .Ldo_align +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $32-3+1, len_dw +- jmp .Lless_than_8_post_shl1 +- + .Ldo_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer +@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl) + jae .Lfull_block + + .Lcontinue_block: +- cmpq $SMALL_SIZE, len +- jb .Lsmall +- + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw +@@ -243,68 +223,38 @@ LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae .Lfull_block +- cmp $24, tmp ++ cmp $SMALL_SIZE, tmp + jae .Lcontinue_block + +-.Lless_than_24: +- shl $32-4, len_dw # less_than_16 expects length +- # in upper 4 bits of len_dw +- jnc .Lless_than_16 +- crc32q (bufptmp), crc_init +- crc32q 8(bufptmp), crc_init +- jz .Ldo_return +- add $16, bufptmp +- # len is less than 8 if we got here +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $2, len_dw +- jmp .Lless_than_8_post_shl1 +- + ####################################################################### +- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ++ ## 6) Process any remainder without interleaving: + ####################################################################### + .Lsmall: +- shl $32-8, len_dw # Prepare len_dw for less_than_256 +- j=256 +-.rept 5 # j = {256, 128, 64, 32, 16} +-.altmacro +-LABEL less_than_ %j # less_than_j: Length should be in +- # upper lg(j) bits of len_dw +- j=(j/2) +- shl $1, len_dw # Get next MSB +- JNC_LESS_THAN %j +-.noaltmacro +- i=0 +-.rept (j/8) +- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data +- i=i+8 +-.endr +- jz .Ldo_return # Return if remaining length is zero +- add $j, bufptmp # Advance buf +-.endr +- +-.Lless_than_8: # Length should be stored in +- # upper 3 bits of len_dw +- shl $1, len_dw +-.Lless_than_8_post_shl1: +- jnc .Lless_than_4 +- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes +- jz .Ldo_return # return if remaining data is zero +- add $4, bufptmp +-.Lless_than_4: # Length should be stored in +- # upper 2 bits of len_dw +- shl $1, len_dw +- jnc .Lless_than_2 +- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes +- jz .Ldo_return # return if remaining data is zero +- add $2, bufptmp +-.Lless_than_2: # Length should be stored in the MSB +- # of len_dw +- shl $1, len_dw +- jnc .Lless_than_1 +- crc32b (bufptmp), crc_init_dw # CRC of 1 byte +-.Lless_than_1: # Length should be zero +-.Ldo_return: ++ test len, len ++ jz .Ldone ++ mov len_dw, %eax ++ shr $3, %eax ++ jz .Ldo_dword ++.Ldo_qwords: ++ 
crc32q (bufptmp), crc_init ++ add $8, bufptmp ++ dec %eax ++ jnz .Ldo_qwords ++.Ldo_dword: ++ test $4, len_dw ++ jz .Ldo_word ++ crc32l (bufptmp), crc_init_dw ++ add $4, bufptmp ++.Ldo_word: ++ test $2, len_dw ++ jz .Ldo_byte ++ crc32w (bufptmp), crc_init_dw ++ add $2, bufptmp ++.Ldo_byte: ++ test $1, len_dw ++ jz .Ldone ++ crc32b (bufptmp), crc_init_dw ++.Ldone: + movq crc_init, %rax + popq %rsi + popq %rdi diff --git a/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch new file mode 100644 index 0000000..b3e564c --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch @@ -0,0 +1,187 @@ +From 430478d63b1403878f2fd4b12de2cd21ee502184 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit + +Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit +values instead of 64-bit, since the upper bits of the corresponding +64-bit registers are not guaranteed to be zero. Also update the type of +the length argument to be unsigned int rather than int, as the assembly +code treats it as unsigned. + +Note: there haven't been any reports of this bug actually causing +incorrect behavior. Neither gcc nor clang guarantee zero-extension to +64 bits, but zero-extension is likely to happen in practice because most +instructions that operate on 32-bit registers zero-extend to 64 bits. + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-intel_glue.c | 2 +- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------ + 2 files changed, 27 insertions(+), 32 deletions(-) + +--- a/arch/x86/crypto/crc32c-intel_glue.c ++++ b/arch/x86/crypto/crc32c-intel_glue.c +@@ -41,7 +41,7 @@ + */ + #define CRC32C_PCL_BREAKEVEN 512 + +-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, ++asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, + unsigned int crc_init); + #endif /* CONFIG_X86_64 */ + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -60,7 +60,7 @@ + # regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); ++# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); + + .text + SYM_FUNC_START(crc_pcl) +@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl) + #define block_0 %rcx + #define block_1 %rdx + #define block_2 %r11 +-#define len %rsi +-#define len_dw %esi +-#define len_w %si +-#define len_b %sil +-#define crc_init_arg %rdx ++#define len %esi ++#define crc_init_arg %edx + #define tmp %rbx +-#define crc_init %r8 +-#define crc_init_dw %r8d ++#define crc_init %r8d ++#define crc_init_q %r8 + #define crc1 %r9 + #define crc2 %r10 + +@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl) + movq (bufptmp), tmp # load a quadward from the buffer + add %bufp, bufptmp # align buffer pointer for quadword + # processing +- sub %bufp, len # update buffer length ++ sub bufp_dw, len # update buffer length + .Lalign_loop: +- crc32b %bl, crc_init_dw # compute crc32 of 1-byte ++ crc32b %bl, crc_init # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec %bufp + jne .Lalign_loop +@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl) + ################################################################ + + ## compute num of bytes to be processed +- movq len, tmp # save num bytes in tmp + +- cmpq $128*24, len ++ cmp $128*24, len + jae .Lfull_block + + .Lcontinue_block: + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len_dw ++ mul len + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do +@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl) + LABEL crc_ %i + .noaltmacro + ENDBR +- crc32q -i*8(block_0), crc_init ++ crc32q -i*8(block_0), crc_init_q + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +@@ -186,7 +182,7 @@ LABEL crc_ %i + LABEL crc_ %i + .noaltmacro + ENDBR +- crc32q -i*8(block_0), crc_init ++ crc32q -i*8(block_0), crc_init_q + crc32q -i*8(block_1), crc1 + # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + +@@ -200,9 +196,9 @@ LABEL crc_ %i + shlq $3, %rax # rax *= 8 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 + leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- subq %rax, tmp # tmp -= rax*24 ++ sub %eax, len # len -= rax*24 + +- movq crc_init, %xmm1 # CRC for block 1 ++ movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 +@@ -211,8 +207,8 @@ LABEL crc_ %i + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax +- mov crc2, crc_init +- crc32 %rax, crc_init ++ mov crc2, crc_init_q ++ crc32 %rax, crc_init_q + + ################################################################ + ## 5) Check for end: +@@ -220,10 +216,9 @@ LABEL crc_ %i + + LABEL crc_ 0 + ENDBR +- mov tmp, len +- cmp $128*24, tmp ++ cmp $128*24, len + jae .Lfull_block +- cmp $SMALL_SIZE, tmp ++ cmp $SMALL_SIZE, len + jae .Lcontinue_block + + ####################################################################### +@@ -232,30 +227,30 @@ LABEL crc_ 0 + .Lsmall: + test len, len + jz .Ldone +- mov len_dw, %eax ++ mov len, %eax + shr $3, %eax + jz .Ldo_dword + .Ldo_qwords: +- crc32q (bufptmp), crc_init ++ crc32q (bufptmp), crc_init_q + add $8, bufptmp + dec %eax + jnz .Ldo_qwords + .Ldo_dword: +- test $4, len_dw ++ test $4, len + jz .Ldo_word +- crc32l (bufptmp), crc_init_dw ++ crc32l (bufptmp), crc_init + add $4, bufptmp + .Ldo_word: +- test $2, len_dw ++ test $2, len + jz .Ldo_byte +- crc32w (bufptmp), crc_init_dw ++ crc32w (bufptmp), crc_init + add $2, bufptmp + .Ldo_byte: +- test $1, len_dw 
++ test $1, len + jz .Ldone +- crc32b (bufptmp), crc_init_dw ++ crc32b (bufptmp), crc_init + .Ldone: +- movq crc_init, %rax ++ mov crc_init, %eax + popq %rsi + popq %rdi + popq %rbx diff --git a/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch new file mode 100644 index 0000000..faaae59 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch @@ -0,0 +1,374 @@ +From 8706bf3e3cba8c708f9933f0d1c6a23f9c2c8c33 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling + +crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully +unrolled and uses a jump table to jump into the correct location. This +optimization is misguided, as it bloats the binary code size and +introduces an indirect call. x86_64 CPUs can predict loops well, so it +is fine to just use a loop instead. Loop bookkeeping instructions can +compete with the crc instructions for the ALUs, but this is easily +mitigated by unrolling the loop by a smaller amount, such as 4 times. + +Therefore, re-roll the loop and make related tweaks to the code. + +This reduces the binary code size of crc_pclmul() from 4546 bytes to 418 +bytes, a 91% reduction. In general it also makes the code faster, with +some large improvements seen when retpoline is enabled. + +More detailed performance results are shown below. They are given as +percent improvement in throughput (negative means regressed) for CPU +microarchitecture vs. input length in bytes. E.g. an improvement from +40 GB/s to 50 GB/s would be listed as 25%. + +Table 1: Results with retpoline enabled (the default): + + | 512 | 833 | 1024 | 2000 | 3173 | 4096 | + ---------------------+-------+-------+-------+------ +-------+-------+ + Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% | + Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% | + AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% | + +Table 2: Results with retpoline disabled: + + | 512 | 833 | 1024 | 2000 | 3173 | 4096 | + ---------------------+-------+-------+-------+------ +-------+-------+ + Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% | + Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% | + AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% | + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++------------- + 1 file changed, 92 insertions(+), 141 deletions(-) + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -7,6 +7,7 @@ + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. ++ * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali +@@ -44,18 +45,9 @@ + */ + + #include +-#include + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +-.macro LABEL prefix n +-.L\prefix\n\(): +-.endm +- +-.macro JMPTBL_ENTRY i +-.quad .Lcrc_\i +-.endm +- + # Define threshold below which buffers are considered "small" and routed to + # regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 +@@ -64,139 +56,116 @@ + + .text + SYM_FUNC_START(crc_pcl) +-#define bufp rdi +-#define bufp_dw %edi +-#define bufp_w %di +-#define bufp_b %dil +-#define bufptmp %rcx +-#define block_0 %rcx +-#define block_1 %rdx +-#define block_2 %r11 +-#define len %esi +-#define crc_init_arg %edx +-#define tmp %rbx +-#define crc_init %r8d +-#define crc_init_q %r8 +-#define crc1 %r9 +-#define crc2 %r10 +- +- pushq %rbx +- pushq %rdi +- pushq %rsi +- +- ## Move crc_init for Linux to a different +- mov crc_init_arg, crc_init ++#define bufp %rdi ++#define bufp_d %edi ++#define len %esi ++#define crc_init %edx ++#define crc_init_q %rdx ++#define n_misaligned %ecx /* overlaps chunk_bytes! */ ++#define n_misaligned_q %rcx ++#define chunk_bytes %ecx /* overlaps n_misaligned! */ ++#define chunk_bytes_q %rcx ++#define crc1 %r8 ++#define crc2 %r9 + +- mov %bufp, bufptmp # rdi = *buf + cmp $SMALL_SIZE, len + jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ +- neg %bufp +- and $7, %bufp # calculate the unalignment amount of ++ mov bufp_d, n_misaligned ++ neg n_misaligned ++ and $7, n_misaligned # calculate the misalignment amount of + # the address +- je .Lproc_block # Skip if aligned ++ je .Laligned # Skip if aligned + ++ # Process 1 <= n_misaligned <= 7 bytes individually in order to align ++ # the remaining data to an 8-byte boundary. + .Ldo_align: +- #### Calculate CRC of unaligned bytes of the buffer (if any) +- movq (bufptmp), tmp # load a quadward from the buffer +- add %bufp, bufptmp # align buffer pointer for quadword +- # processing +- sub bufp_dw, len # update buffer length ++ movq (bufp), %rax ++ add n_misaligned_q, bufp ++ sub n_misaligned, len + .Lalign_loop: +- crc32b %bl, crc_init # compute crc32 of 1-byte +- shr $8, tmp # get next byte +- dec %bufp ++ crc32b %al, crc_init # compute crc32 of 1-byte ++ shr $8, %rax # get next byte ++ dec n_misaligned + jne .Lalign_loop +- +-.Lproc_block: ++.Laligned: + + ################################################################ +- ## 2) PROCESS BLOCKS: ++ ## 2) PROCESS BLOCK: + ################################################################ + +- ## compute num of bytes to be processed +- + cmp $128*24, len + jae .Lfull_block + +-.Lcontinue_block: +- ## len < 128*24 +- movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len +- shrq $16, %rax +- +- ## eax contains floor(bytes / 24) = num 24-byte chunks to do +- +- ## process rax 24-byte chunks (128 >= rax >= 0) +- +- ## compute end address of each block +- ## block 0 (base addr + RAX * 8) +- ## block 1 (base addr + RAX * 16) +- ## block 2 (base addr + RAX * 24) +- lea (bufptmp, %rax, 8), block_0 +- lea (block_0, %rax, 8), block_1 +- lea (block_1, %rax, 8), block_2 +- +- xor crc1, crc1 +- xor crc2, crc2 +- +- ## branch into array +- leaq jump_table(%rip), %bufp +- mov (%bufp,%rax,8), %bufp +- JMP_NOSPEC bufp ++.Lpartial_block: ++ # Compute floor(len / 24) to get num qwords to process from each lane. 
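++	# Multiplying by 2731 = ceil(2^16 / 24) and shifting right by 16 is a
++	# fixed-point reciprocal: it overshoots len/24 by exactly len/196608,
++	# which for len < 128*24 stays below 1/24 and so never carries the
++	# result past the next multiple of 1/24.  E.g. len = 3000:
++	# 3000 * 2731 = 8193000, and 8193000 >> 16 = 125 = 3000/24.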
++ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) ++ shr $16, %eax ++ jmp .Lcrc_3lanes + +- ################################################################ +- ## 2a) PROCESS FULL BLOCKS: +- ################################################################ + .Lfull_block: +- movl $128,%eax +- lea 128*8*2(block_0), block_1 +- lea 128*8*3(block_0), block_2 +- add $128*8*1, block_0 +- +- xor crc1,crc1 +- xor crc2,crc2 +- +- # Fall through into top of crc array (crc_128) ++ # Processing 128 qwords from each lane. ++ mov $128, %eax + + ################################################################ +- ## 3) CRC Array: ++ ## 3) CRC each of three lanes: + ################################################################ + +- i=128 +-.rept 128-1 +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init_q +- crc32q -i*8(block_1), crc1 +- crc32q -i*8(block_2), crc2 +- i=(i-1) +-.endr +- +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init_q +- crc32q -i*8(block_1), crc1 +-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet ++.Lcrc_3lanes: ++ xor crc1,crc1 ++ xor crc2,crc2 ++ mov %eax, chunk_bytes ++ shl $3, chunk_bytes # num bytes to process from each lane ++ sub $5, %eax # 4 for 4x_loop, 1 for special last iter ++ jl .Lcrc_3lanes_4x_done ++ ++ # Unroll the loop by a factor of 4 to reduce the overhead of the loop ++ # bookkeeping instructions, which can compete with crc32q for the ALUs. ++.Lcrc_3lanes_4x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ crc32q 8(bufp), crc_init_q ++ crc32q 8(bufp,chunk_bytes_q), crc1 ++ crc32q 8(bufp,chunk_bytes_q,2), crc2 ++ crc32q 16(bufp), crc_init_q ++ crc32q 16(bufp,chunk_bytes_q), crc1 ++ crc32q 16(bufp,chunk_bytes_q,2), crc2 ++ crc32q 24(bufp), crc_init_q ++ crc32q 24(bufp,chunk_bytes_q), crc1 ++ crc32q 24(bufp,chunk_bytes_q,2), crc2 ++ add $32, bufp ++ sub $4, %eax ++ jge .Lcrc_3lanes_4x_loop ++ ++.Lcrc_3lanes_4x_done: ++ add $4, %eax ++ jz .Lcrc_3lanes_last_qword ++ ++.Lcrc_3lanes_1x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ add $8, bufp ++ dec %eax ++ jnz .Lcrc_3lanes_1x_loop + +- mov block_2, block_0 ++.Lcrc_3lanes_last_qword: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + +- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 +- shlq $3, %rax # rax *= 8 +- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 +- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- sub %eax, len # len -= rax*24 ++ lea (K_table-8)(%rip), %rax # first entry is for idx 1 ++ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 ++ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 ++ sub %eax, len # len -= chunk_bytes * 3 + + movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 +@@ -206,20 +175,19 @@ LABEL crc_ %i + + pxor %xmm2,%xmm1 + movq %xmm1, %rax +- xor -i*8(block_2), %rax ++ xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc_init_q + crc32 %rax, crc_init_q ++ lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ +- ## 5) Check for end: ++ ## 5) If more blocks remain, goto (2): + 
################################################################ + +-LABEL crc_ 0 +- ENDBR + cmp $128*24, len +- jae .Lfull_block ++ jae .Lfull_block + cmp $SMALL_SIZE, len +- jae .Lcontinue_block ++ jae .Lpartial_block + + ####################################################################### + ## 6) Process any remainder without interleaving: +@@ -231,47 +199,30 @@ LABEL crc_ 0 + shr $3, %eax + jz .Ldo_dword + .Ldo_qwords: +- crc32q (bufptmp), crc_init_q +- add $8, bufptmp ++ crc32q (bufp), crc_init_q ++ add $8, bufp + dec %eax + jnz .Ldo_qwords + .Ldo_dword: + test $4, len + jz .Ldo_word +- crc32l (bufptmp), crc_init +- add $4, bufptmp ++ crc32l (bufp), crc_init ++ add $4, bufp + .Ldo_word: + test $2, len + jz .Ldo_byte +- crc32w (bufptmp), crc_init +- add $2, bufptmp ++ crc32w (bufp), crc_init ++ add $2, bufp + .Ldo_byte: + test $1, len + jz .Ldone +- crc32b (bufptmp), crc_init ++ crc32b (bufp), crc_init + .Ldone: + mov crc_init, %eax +- popq %rsi +- popq %rdi +- popq %rbx + RET + SYM_FUNC_END(crc_pcl) + + .section .rodata, "a", @progbits +- ################################################################ +- ## jump table Table is 129 entries x 2 bytes each +- ################################################################ +-.align 4 +-jump_table: +- i=0 +-.rept 129 +-.altmacro +-JMPTBL_ENTRY %i +-.noaltmacro +- i=i+1 +-.endr +- +- + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each diff --git a/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch b/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch new file mode 100644 index 0000000..1fb3740 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch @@ -0,0 +1,31 @@ +From cda0e050fec85635986e9cfe991e26339bf305dc Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sat, 13 Jan 2024 15:29:25 +0100 +Subject: arch/Kconfig: Default to maximum amount of ASLR bits + +To mitigate https://zolutal.github.io/aslrnt/; do this with a patch to +avoid having to enable `CONFIG_EXPERT`. 
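+
+As a quick sanity check (illustrative only; the sysctl names assume an
+architecture that provides HAVE_ARCH_MMAP_RND_BITS, such as x86_64 or arm64),
+the number of bits in effect and the resulting jitter of the mmap base can be
+observed on a running system:
+
+    $ sysctl vm.mmap_rnd_bits vm.mmap_rnd_compat_bits
+    $ for i in 1 2 3; do grep -m1 -F libc.so /proc/self/maps; done
+
+(each grep is a fresh process, so its libc mapping moves between iterations)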
+--- + arch/Kconfig | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to diff --git a/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch b/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch new file mode 100644 index 0000000..8b422f7 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch @@ -0,0 +1,112 @@ +From b7d96c1f19ef15ea431a8d5d7ab2cad22c35edba Mon Sep 17 00:00:00 2001 +From: Qais Yousef +Date: Sun, 28 Jul 2024 20:26:59 +0100 +Subject: cpufreq: Remove LATENCY_MULTIPLIER + +The current LATENCY_MULTIPLIER which has been around for nearly 20 years +causes rate_limit_us to be always in ms range. + +On M1 mac mini I get 50 and 56us transition latency, but due to the 1000 +multiplier we end up setting rate_limit_us to 50 and 56ms, which gets +capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change +default transition delay to 2ms") + +On Intel I5 system transition latency is 20us but due to the multiplier +we end up with 20ms that again is capped to 2ms. + +Given how good modern hardware and how modern workloads require systems +to be more responsive to cater for sudden changes in workload (tasks +sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and +that 2ms is quarter of the time of 120Hz refresh rate system, drop the +old logic in favour of providing 50% headroom. + + rate_limit_us = 1.5 * latency. + +I considered not adding any headroom which could mean that we can end up +with infinite back-to-back requests. + +I also considered providing a constant headroom (e.g: 100us) assuming +that any h/w or f/w dealing with the request shouldn't require a large +headroom when transition_latency is actually high. + +But for both cases I wasn't sure if h/w or f/w can end up being +overwhelmed dealing with the freq requests in a potentially busy system. +So I opted for providing 50% breathing room. + +This is expected to impact schedutil only as the other user, +dbs_governor, takes the max(2*tick, transition_delay_us) and the former +was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us +before applying this patch. For systems with TICK of 4ms, this value +would have almost always ended up with 8ms sampling rate. + +For systems that report 0 transition latency, we still default to +returning 1ms as transition delay. + +This helps in eliminating a source of latency for applying requests as +mentioned in [1]. 
For example if we have a 1ms tick, most systems will +miss sending an update at tick when updating the util_avg for a task/CPU +(rate_limit_us will be 2ms for most systems). + +Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1] +Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/ +Signed-off-by: Qais Yousef +Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io +[ rjw: Subject edits ] +Signed-off-by: Rafael J. Wysocki +--- + drivers/cpufreq/cpufreq.c | 27 ++++----------------------- + include/linux/cpufreq.h | 6 ------ + 2 files changed, 4 insertions(+), 29 deletions(-) + +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_d + return policy->transition_delay_us; + + latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; +- if (latency) { +- unsigned int max_delay_us = 2 * MSEC_PER_SEC; ++ if (latency) ++ /* Give a 50% breathing room between updates */ ++ return latency + (latency >> 1); + +- /* +- * If the platform already has high transition_latency, use it +- * as-is. +- */ +- if (latency > max_delay_us) +- return latency; +- +- /* +- * For platforms that can change the frequency very fast (< 2 +- * us), the above formula gives a decent transition delay. But +- * for platforms where transition_latency is in milliseconds, it +- * ends up giving unrealistic values. +- * +- * Cap the default transition delay to 2 ms, which seems to be +- * a reasonable amount of time after which we should reevaluate +- * the frequency. +- */ +- return min(latency * LATENCY_MULTIPLIER, max_delay_us); +- } +- +- return LATENCY_MULTIPLIER; ++ return USEC_PER_MSEC; + } + EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); + +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scal + #define CPUFREQ_POLICY_POWERSAVE (1) + #define CPUFREQ_POLICY_PERFORMANCE (2) + +-/* +- * The polling frequency depends on the capability of the processor. Default +- * polling frequency is 1000 times the transition latency of the processor. +- */ +-#define LATENCY_MULTIPLIER (1000) +- + struct cpufreq_governor { + char name[CPUFREQ_NAME_LEN]; + int (*init)(struct cpufreq_policy *policy); diff --git a/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch b/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch new file mode 100644 index 0000000..a97bffc --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch @@ -0,0 +1,83 @@ +From 218e958524c673d6e68737e7f82d80ba2b6ef59a Mon Sep 17 00:00:00 2001 +From: Javier Martinez Canillas +Date: Thu, 19 May 2022 14:40:07 +0200 +Subject: drivers/firmware: skip simpledrm if nvidia-drm.modeset=1 is set + +The Nvidia proprietary driver has some bugs that leads to issues if used +with the simpledrm driver. The most noticeable is that does not register +an emulated fbdev device. + +It just relies on a fbdev to be registered by another driver, that could +be that could be attached to the framebuffer console. On UEFI machines, +this is the efifb driver. + +This means that disabling the efifb driver will cause virtual consoles to +not be present in the system when using the Nvidia driver. Legacy BIOS is +not affected just because fbcon is not used there, but instead vgacon. 
+ +Unless a VGA mode is specified using the vga= kernel command line option, +in that case the vesafb driver is used instead and its fbdev attached to +the fbcon. + +This is a problem because with CONFIG_SYSFB_SIMPLEFB=y, the sysfb platform +code attempts to register a "simple-framebuffer" platform device (that is +matched against simpledrm) and only registers either an "efi-framebuffer" +or "vesa-framebuffer" if this fails to be registered due the video modes +not being compatible. + +The Nvidia driver relying on another driver to register the fbdev is quite +fragile, since it can't really assume those will stick around. For example +there are patches posted to remove the EFI and VESA platform devices once +a real DRM or fbdev driver probes. + +But in any case, moving to a simpledrm + emulated fbdev only breaks this +assumption and causes users to not have VT if the Nvidia driver is used. + +So to prevent this, let's add a workaround and make the sysfb to skip the +"simple-framebuffer" registration when nvidia-drm.modeset=1 option is set. + +This is quite horrible, but honestly I can't think of any other approach. + +For this to work, the CONFIG_FB_EFI and CONFIG_FB_VESA config options must +be enabled besides CONFIG_DRM_SIMPLEDRM. + +Signed-off-by: Javier Martinez Canillas +Cherry-picked-for: https://bugs.archlinux.org/task/73720 +--- + drivers/firmware/sysfb.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +--- a/drivers/firmware/sysfb.c ++++ b/drivers/firmware/sysfb.c +@@ -35,6 +35,22 @@ + #include + #include + ++static int skip_simpledrm; ++ ++static int __init simpledrm_disable(char *opt) ++{ ++ if (!opt) ++ return -EINVAL; ++ ++ get_option(&opt, &skip_simpledrm); ++ ++ if (skip_simpledrm) ++ pr_info("The simpledrm driver will not be probed\n"); ++ ++ return 0; ++} ++early_param("nvidia-drm.modeset", simpledrm_disable); ++ + static struct platform_device *pd; + static DEFINE_MUTEX(disable_lock); + static bool disabled; +@@ -145,7 +161,7 @@ static __init int sysfb_init(void) + + /* try to create a simple-framebuffer device */ + compatible = sysfb_parse_mode(si, &mode); +- if (compatible) { ++ if (compatible && !skip_simpledrm) { + pd = sysfb_create_simplefb(si, &mode, parent); + if (!IS_ERR(pd)) + goto put_device; diff --git a/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch b/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch new file mode 100644 index 0000000..9086a94 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch @@ -0,0 +1,26 @@ +From b97d21a0aa65a6f7a7bb17bbc696b136688c96ed Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Mon, 26 Aug 2024 08:50:11 -0400 +Subject: nfsd: add more info to WARN_ON_ONCE on failed callbacks + +Currently, you get the warning and stack trace, but nothing is printed +about the relevant error codes. Add that in. 
+ +Signed-off-by: Jeff Layton +Signed-off-by: Chuck Lever +--- + fs/nfsd/nfs4callback.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/nfsd/nfs4callback.c ++++ b/fs/nfsd/nfs4callback.c +@@ -1333,7 +1333,8 @@ static void nfsd4_cb_done(struct rpc_tas + return; + + if (cb->cb_status) { +- WARN_ON_ONCE(task->tk_status); ++ WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d", ++ cb->cb_status, task->tk_status); + task->tk_status = cb->cb_status; + } + diff --git a/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch b/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch new file mode 100644 index 0000000..5b33527 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch @@ -0,0 +1,57 @@ +From 1d120544580708eae6bd5981b308ca17735edaac Mon Sep 17 00:00:00 2001 +From: Vitaly Lifshits +Date: Tue, 1 Oct 2024 20:08:48 +0300 +Subject: e1000e: Remove Meteor Lake SMBUS workarounds + +This is a partial revert to commit 76a0a3f9cc2f ("e1000e: fix force smbus +during suspend flow"). That commit fixed a sporadic PHY access issue but +introduced a regression in runtime suspend flows. +The original issue on Meteor Lake systems was rare in terms of the +reproduction rate and the number of the systems affected. + +After the integration of commit 0a6ad4d9e169 ("e1000e: avoid failing the +system during pm_suspend"), PHY access loss can no longer cause a +system-level suspend failure. As it only occurs when the LAN cable is +disconnected, and is recovered during system resume flow. Therefore, its +functional impact is low, and the priority is given to stabilizing +runtime suspend. + +Fixes: 76a0a3f9cc2f ("e1000e: fix force smbus during suspend flow") +Signed-off-by: Vitaly Lifshits +--- + drivers/net/ethernet/intel/e1000e/ich8lan.c | 17 ++++------------- + 1 file changed, 4 insertions(+), 13 deletions(-) + +--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c ++++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c +@@ -1205,12 +1205,10 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + if (ret_val) + goto out; + +- if (hw->mac.type != e1000_pch_mtp) { +- ret_val = e1000e_force_smbus(hw); +- if (ret_val) { +- e_dbg("Failed to force SMBUS: %d\n", ret_val); +- goto release; +- } ++ ret_val = e1000e_force_smbus(hw); ++ if (ret_val) { ++ e_dbg("Failed to force SMBUS: %d\n", ret_val); ++ goto release; + } + + /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable +@@ -1273,13 +1271,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + } + + release: +- if (hw->mac.type == e1000_pch_mtp) { +- ret_val = e1000e_force_smbus(hw); +- if (ret_val) +- e_dbg("Failed to force SMBUS over MTL system: %d\n", +- ret_val); +- } +- + hw->phy.ops.release(hw); + out: + if (ret_val) diff --git a/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch b/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch new file mode 100644 index 0000000..cc462c6 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch @@ -0,0 +1,46 @@ +From 4086c1a804741c9c8f418d6088e8c531f2a481f3 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 1 Oct 2024 17:03:32 +0900 +Subject: btrfs: zoned: fix zone unusable accounting for freed reserved extent + +When btrfs reserves an extent and does not use it (e.g, by an error), it +calls btrfs_free_reserved_extent() to free the reserved extent. 
In the +process, it calls btrfs_add_free_space() and then it accounts the region +bytes as block_group->zone_unusable. + +However, it leaves the space_info->bytes_zone_unusable side not updated. As +a result, ENOSPC can happen while a space_info reservation succeeded. The +reservation is fine because the freed region is not added in +space_info->bytes_zone_unusable, leaving that space as "free". OTOH, +corresponding block group counts it as zone_unusable and its allocation +pointer is not rewound, we cannot allocate an extent from that block group. +That will also negate space_info's async/sync reclaim process, and cause an +ENOSPC error from the extent allocation process. + +Fix that by returning the space to space_info->bytes_zone_unusable. +Ideally, since a bio is not submitted for this reserved region, we should +return the space to free space and rewind the allocation pointer. But, it +needs rework on extent allocation handling, so let it work in this way for +now. + +Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/block-group.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct bt + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; ++ else if (btrfs_is_zoned(cache->fs_info)) ++ space_info->bytes_zone_unusable += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; diff --git a/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch b/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch new file mode 100644 index 0000000..42144a5 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch @@ -0,0 +1,64 @@ +From aa8155f0ba032729ec4f28c5cb9669fb14f6947b Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 14 Oct 2024 16:14:18 +0100 +Subject: btrfs: clear force-compress on remount when compress mount option is + given + +After the migration to use fs context for processing mount options we had +a slight change in the semantics for remounting a filesystem that was +mounted with compress-force. Before we could clear compress-force by +passing only "-o compress[=algo]" during a remount, but after that change +that does not work anymore, force-compress is still present and one needs +to pass "-o compress-force=no,compress[=algo]" to the mount command. 
+ +Example, when running on a kernel 6.8+: + + $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) + + $ mount -o remount,compress=zlib:5 /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) + +On a 6.7 kernel (or older): + + $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) + + $ mount -o remount,compress=zlib:5 /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) + +So update btrfs_parse_param() to clear "compress-force" when "compress" is +given, providing the same semantics as kernel 6.7 and older. + +Reported-by: Roman Mamedov +Link: https://lore.kernel.org/linux-btrfs/20241014182416.13d0f8b0@nvm/ +CC: stable@vger.kernel.org # 6.8+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/super.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -340,6 +340,15 @@ static int btrfs_parse_param(struct fs_c + fallthrough; + case Opt_compress: + case Opt_compress_type: ++ /* ++ * Provide the same semantics as older kernels that don't use fs ++ * context, specifying the "compress" option clears ++ * "force-compress" without the need to pass ++ * "compress-force=[no|none]" before specifying "compress". ++ */ ++ if (opt != Opt_compress_force && opt != Opt_compress_force_type) ++ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); ++ + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; diff --git a/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch b/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch new file mode 100644 index 0000000..2c1b65f --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch @@ -0,0 +1,68 @@ +From 81baeb2a67d8245ac5b61299e54dd65defd4ac72 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 10 Sep 2024 15:21:04 +0930 +Subject: btrfs: qgroup: set a more sane default value for subtree drop + threshold + +Since commit 011b46c30476 ("btrfs: skip subtree scan if it's too high to +avoid low stall in btrfs_commit_transaction()"), btrfs qgroup can +automatically skip large subtree scan at the cost of marking qgroup +inconsistent. + +It's designed to address the final performance problem of snapshot drop +with qgroup enabled, but to be safe the default value is +BTRFS_MAX_LEVEL, requiring a user space daemon to set a different value +to make it work. + +I'd say it's not a good idea to rely on user space tool to set this +default value, especially when some operations (snapshot dropping) can +be triggered immediately after mount, leaving a very small window to +that that sysfs interface. + +So instead of disabling this new feature by default, enable it with a +low threshold (3), so that large subvolume tree drop at mount time won't +cause huge qgroup workload. 
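+
+For reference, the threshold remains tunable at runtime through sysfs. A
+sketch, assuming a 6.1+ kernel where the knob is exposed in the
+per-filesystem qgroups directory (adjust the path if your kernel differs):
+
+    $ uuid=$(findmnt -no UUID /mnt/btrfs)
+    $ cat /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
+    $ echo 8 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold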
+ +CC: stable@vger.kernel.org # 6.1 +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/qgroup.h | 2 ++ + 3 files changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1960,7 +1960,7 @@ static void btrfs_init_qgroup(struct btr + fs_info->qgroup_seq = 1; + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; +- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; ++ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; + mutex_init(&fs_info->qgroup_rescan_lock); + } + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1407,7 +1407,7 @@ int btrfs_quota_disable(struct btrfs_fs_ + fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; +- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; ++ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; + spin_unlock(&fs_info->qgroup_lock); + + btrfs_free_qgroup_config(fs_info); +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -121,6 +121,8 @@ struct btrfs_inode; + #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) + #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) + ++#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) ++ + /* + * Record a dirty extent, and info qgroup to update quota on it + */ diff --git a/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch b/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch new file mode 100644 index 0000000..44112eb --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch @@ -0,0 +1,32 @@ +From 8ea93b01558ea7a752e478ad25862e7441d6053a Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Thu, 19 Sep 2024 12:16:38 +0200 +Subject: btrfs: also add stripe entries for NOCOW writes + +NOCOW writes do not generate stripe_extent entries in the RAID stripe +tree, as the RAID stripe-tree feature initially was designed with a +zoned filesystem in mind and on a zoned filesystem, we do not allow NOCOW +writes. But the RAID stripe-tree feature is independent from the zoned +feature, so we must also do NOCOW writes for RAID stripe-tree filesystems. 
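+
+For context, a NOCOW write is an overwrite of a file that carries the NOCOW
+attribute. An illustrative way to produce one (file name arbitrary):
+
+    $ touch f && chattr +C f      # NOCOW must be set while the file is empty
+    $ dd if=/dev/urandom of=f bs=64K count=16                # allocates extents
+    $ dd if=/dev/urandom of=f bs=64K count=16 conv=notrunc   # overwrite is NOCOW
+
+With this change, such writes also receive stripe entries in the RAID
+stripe tree.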
+ +Reviewed-by: Naohiro Aota +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +--- + fs/btrfs/inode.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3087,6 +3087,11 @@ int btrfs_finish_one_ordered(struct btrf + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); ++ ++ ret = btrfs_insert_raid_extent(trans, ordered_extent); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ + goto out; + } + diff --git a/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch b/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch new file mode 100644 index 0000000..496ce00 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch @@ -0,0 +1,107 @@ +From f6f5cd12972307324de5decd7fa41b0b3c98639c Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Fri, 18 Oct 2024 15:44:34 -0700 +Subject: btrfs: fix read corruption due to race with extent map merging + +In debugging some corrupt squashfs files, we observed symptoms of +corrupt page cache pages but correct on-disk contents. Further +investigation revealed that the exact symptom was a correct page +followed by an incorrect, duplicate, page. This got us thinking about +extent maps. + +commit ac05ca913e9f ("Btrfs: fix race between using extent maps and merging them") +enforces a reference count on the primary `em` extent_map being merged, +as that one gets modified. + +However, since, +commit 3d2ac9922465 ("btrfs: introduce new members for extent_map") +both 'em' and 'merge' get modified, which started modifying 'merge' +and thus introduced the same race. + +We were able to reproduce this by looping the affected squashfs workload +in parallel on a bunch of separate btrfs-es while also dropping caches. +We are still working on a simple enough reproducer to make into an fstest. + +The simplest fix is to stop modifying 'merge', which is not essential, +as it is dropped immediately after the merge. This behavior is simply +a consequence of the order of the two extent maps being important in +computing the new values. Modify merge_ondisk_extents to take prev and +next by const* and also take a third merged parameter that it puts the +results in. Note that this introduces the rather odd behavior of passing +'em' to merge_ondisk_extents as a const * and as a regular ptr. + +Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map") +CC: stable@vger.kernel.org # 6.11+ +Reviewed-by: Qu Wenruo +Reviewed-by: Filipe Manana +Signed-off-by: Omar Sandoval +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +--- + fs/btrfs/extent_map.c | 31 ++++++++++++++++--------------- + 1 file changed, 16 insertions(+), 15 deletions(-) + +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -240,13 +240,19 @@ static bool mergeable_maps(const struct + /* + * Handle the on-disk data extents merge for @prev and @next. + * ++ * @prev: left extent to merge ++ * @next: right extent to merge ++ * @merged: the extent we will not discard after the merge; updated with new values ++ * ++ * After this, one of the two extents is the new merged extent and the other is ++ * removed from the tree and likely freed. Note that @merged is one of @prev/@next ++ * so there is const/non-const aliasing occurring here. ++ * + * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. 
+ * For now only uncompressed regular extent can be merged. +- * +- * @prev and @next will be both updated to point to the new merged range. +- * Thus one of them should be removed by the caller. + */ +-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next) ++static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, ++ struct extent_map *merged) + { + u64 new_disk_bytenr; + u64 new_disk_num_bytes; +@@ -281,15 +287,10 @@ static void merge_ondisk_extents(struct + new_disk_bytenr; + new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; + +- prev->disk_bytenr = new_disk_bytenr; +- prev->disk_num_bytes = new_disk_num_bytes; +- prev->ram_bytes = new_disk_num_bytes; +- prev->offset = new_offset; +- +- next->disk_bytenr = new_disk_bytenr; +- next->disk_num_bytes = new_disk_num_bytes; +- next->ram_bytes = new_disk_num_bytes; +- next->offset = new_offset; ++ merged->disk_bytenr = new_disk_bytenr; ++ merged->disk_num_bytes = new_disk_num_bytes; ++ merged->ram_bytes = new_disk_num_bytes; ++ merged->offset = new_offset; + } + + static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, +@@ -358,7 +359,7 @@ static void try_merge_map(struct btrfs_i + em->generation = max(em->generation, merge->generation); + + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) +- merge_ondisk_extents(merge, em); ++ merge_ondisk_extents(merge, em, em); + em->flags |= EXTENT_FLAG_MERGED; + + validate_extent_map(fs_info, em); +@@ -375,7 +376,7 @@ static void try_merge_map(struct btrfs_i + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { + em->len += merge->len; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) +- merge_ondisk_extents(em, merge); ++ merge_ondisk_extents(em, merge, em); + validate_extent_map(fs_info, em); + rb_erase(&merge->rb_node, &tree->root); + RB_CLEAR_NODE(&merge->rb_node); diff --git a/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch b/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch new file mode 100644 index 0000000..4851e23 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch @@ -0,0 +1,101 @@ +From 7f83049bda761f340991af8dce79a4e98c62b378 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Thu, 19 Sep 2024 20:18:11 +0930 +Subject: btrfs: reject ro->rw reconfiguration if there are hard ro + requirements + +[BUG] +Syzbot reports the following crash: + + BTRFS info (device loop0 state MCS): disabling free space tree + BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE (0x1) + BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE_VALID (0x2) + Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN NOPTI + KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 + RIP: 0010:backup_super_roots fs/btrfs/disk-io.c:1691 [inline] + RIP: 0010:write_all_supers+0x97a/0x40f0 fs/btrfs/disk-io.c:4041 + Call Trace: + + btrfs_commit_transaction+0x1eae/0x3740 fs/btrfs/transaction.c:2530 + btrfs_delete_free_space_tree+0x383/0x730 fs/btrfs/free-space-tree.c:1312 + btrfs_start_pre_rw_mount+0xf28/0x1300 fs/btrfs/disk-io.c:3012 + btrfs_remount_rw fs/btrfs/super.c:1309 [inline] + btrfs_reconfigure+0xae6/0x2d40 
fs/btrfs/super.c:1534 + btrfs_reconfigure_for_mount fs/btrfs/super.c:2020 [inline] + btrfs_get_tree_subvol fs/btrfs/super.c:2079 [inline] + btrfs_get_tree+0x918/0x1920 fs/btrfs/super.c:2115 + vfs_get_tree+0x90/0x2b0 fs/super.c:1800 + do_new_mount+0x2be/0xb40 fs/namespace.c:3472 + do_mount fs/namespace.c:3812 [inline] + __do_sys_mount fs/namespace.c:4020 [inline] + __se_sys_mount+0x2d6/0x3c0 fs/namespace.c:3997 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +[CAUSE] +To support mounting different subvolume with different RO/RW flags for +the new mount APIs, btrfs introduced two workaround to support this feature: + +- Skip mount option/feature checks if we are mounting a different + subvolume + +- Reconfigure the fs to RW if the initial mount is RO + +Combining these two, we can have the following sequence: + +- Mount the fs ro,rescue=all,clear_cache,space_cache=v1 + rescue=all will mark the fs as hard read-only, so no v2 cache clearing + will happen. + +- Mount a subvolume rw of the same fs. + We go into btrfs_get_tree_subvol(), but fc_mount() returns EBUSY + because our new fc is RW, different from the original fs. + + Now we enter btrfs_reconfigure_for_mount(), which switches the RO flag + first so that we can grab the existing fs_info. + Then we reconfigure the fs to RW. + +- During reconfiguration, option/features check is skipped + This means we will restart the v2 cache clearing, and convert back to + v1 cache. + This will trigger fs writes, and since the original fs has "rescue=all" + option, it skips the csum tree read. + + And eventually causing NULL pointer dereference in super block + writeback. + +[FIX] +For reconfiguration caused by different subvolume RO/RW flags, ensure we +always run btrfs_check_options() to ensure we have proper hard RO +requirements met. + +In fact the function btrfs_check_options() doesn't really do many +complex checks, but hard RO requirement and some feature dependency +checks, thus there is no special reason not to do the check for mount +reconfiguration. 
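+
+Condensed reproducer for the sequence above (device, mount points and the
+subvolume name are illustrative):
+
+    $ mount -o ro,rescue=all,clear_cache,space_cache=v1 /dev/loop0 /mnt/a
+    $ mount -o rw,subvol=/snap /dev/loop0 /mnt/b
+
+Before this change the second mount re-ran the v1 cache conversion on a
+filesystem mounted with rescue=all and crashed in super block writeback;
+with the options check applied to reconfiguration as well, the rw
+transition is rejected instead.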
+ +Reported-by: syzbot+56360f93efa90ff15870@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/0000000000008c5d090621cb2770@google.com/ +Fixes: f044b318675f ("btrfs: handle the ro->rw transition for mounting different subvolumes") +CC: stable@vger.kernel.org # 6.8+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +--- + fs/btrfs/super.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1519,8 +1519,7 @@ static int btrfs_reconfigure(struct fs_c + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + +- if (!mount_reconfigure && +- !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) ++ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); diff --git a/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch b/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch new file mode 100644 index 0000000..06b8612 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch @@ -0,0 +1,54 @@ +From ed73b9279db9536a9672cba6506950c26cedb140 Mon Sep 17 00:00:00 2001 +From: Yue Haibing +Date: Tue, 22 Oct 2024 17:52:08 +0800 +Subject: btrfs: fix passing 0 to ERR_PTR in btrfs_search_dir_index_item() + +The ret may be zero in btrfs_search_dir_index_item() and should not +passed to ERR_PTR(). Now btrfs_unlink_subvol() is the only caller to +this, reconstructed it to check ERR_PTR(-ENOENT) while ret >= 0. + +This fixes smatch warnings: + +fs/btrfs/dir-item.c:353 + btrfs_search_dir_index_item() warn: passing zero to 'ERR_PTR' + +Fixes: 9dcbe16fccbb ("btrfs: use btrfs_for_each_slot in btrfs_search_dir_index_item") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Yue Haibing +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/dir-item.c | 4 ++-- + fs/btrfs/inode.c | 7 ++----- + 2 files changed, 4 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -347,8 +347,8 @@ btrfs_search_dir_index_item(struct btrfs + return di; + } + /* Adjust return code if the key was not found in the next leaf. */ +- if (ret > 0) +- ret = 0; ++ if (ret >= 0) ++ ret = -ENOENT; + + return ERR_PTR(ret); + } +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4344,11 +4344,8 @@ static int btrfs_unlink_subvol(struct bt + */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); +- if (IS_ERR_OR_NULL(di)) { +- if (!di) +- ret = -ENOENT; +- else +- ret = PTR_ERR(di); ++ if (IS_ERR(di)) { ++ ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); + goto out; + } diff --git a/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch b/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch new file mode 100644 index 0000000..18a27a7 --- /dev/null +++ b/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch @@ -0,0 +1,407 @@ +From 88362669534c70bbc7036f45bb23e63a30d4adfb Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Mon, 29 Jul 2024 00:38:24 +0200 +Subject: mm: expose per-process KSM control via syscalls + +d7597f59d1d3 added a new API to enable per-process KSM control. 
It +however uses prctl, which doesn't allow controlling KSM from outside of +the current process. + +Hence, expose this API via 3 syscalls: process_ksm_enable, +process_ksm_disable and process_ksm_status. Given sufficient privileges, +auto-KSM can be enable by another process. + +Since these syscalls are not in the upstream kernel, also expose their +numbers under /sys/kernel/process_ksm so that userspace tooling like +uksmd knows how to use them. + +Signed-off-by: Oleksandr Natalenko +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 147 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 215 insertions(+), 1 deletion(-) + +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status 
sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common 
lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned lon + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2789,6 +2789,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsi + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct pid *pid; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ pid = pidfd_get_pid(pidfd, &f_flags); ++ if (IS_ERR(pid)) { ++ ret = PTR_ERR(pid); ++ goto out; ++ } ++ ++ task = get_pid_task(pid, PIDTYPE_PID); ++ if (!task) { ++ ret = -ESRCH; ++ goto put_pid; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++put_pid: ++ put_pid(pid); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 
common process_ksm_status sys_process_ksm_status +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch b/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch new file mode 100644 index 0000000..1213043 --- /dev/null +++ b/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch @@ -0,0 +1,50 @@ +From 9308d03bfeb941469da17e2903ca06254b110b25 Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Tue, 24 Sep 2024 11:58:41 +0200 +Subject: mm/process_ksm: use pidfd_get_task() instead of + pidfd_get_pid()+get_pid_task() + +Link: https://git.kernel.org/linus/ee9955d61a0a +Signed-off-by: Oleksandr Natalenko +--- + kernel/sys.c | 15 +++------------ + 1 file changed, 3 insertions(+), 12 deletions(-) + +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2799,23 +2799,16 @@ enum pkc_action { + static long do_process_ksm_control(int pidfd, enum pkc_action action) + { + long ret; +- struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + unsigned int f_flags; + +- pid = pidfd_get_pid(pidfd, &f_flags); +- if (IS_ERR(pid)) { +- ret = PTR_ERR(pid); ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); + goto out; + } + +- task = get_pid_task(pid, PIDTYPE_PID); +- if (!task) { +- ret = -ESRCH; +- goto put_pid; +- } +- + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. 
*/ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { +@@ -2852,8 +2845,6 @@ release_mm: + mmput(mm); + release_task: + put_task_struct(task); +-put_pid: +- put_pid(pid); + out: + return ret; + } diff --git a/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch new file mode 100644 index 0000000..7b94907 --- /dev/null +++ b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch @@ -0,0 +1,18533 @@ +From 9b2b1b2e996c06793bc975dd66dccaadb0c0d637 Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Mon, 29 Jul 2024 00:42:23 +0200 +Subject: zstd: import upstream v1.5.6 + +Signed-off-by: Oleksandr Natalenko +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 +- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 356 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6576 insertions(+), 3530 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, 
Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionStri + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getF + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompres + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLev + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZST + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZST + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. 
++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZST + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(Z + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. 
+@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_ + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. 
++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_C + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. 
++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). 
++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZS + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(Z + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDiction + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. 
+- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1303,7 +1395,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZS + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. 
++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! 
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated +- */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). ++ */ ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateS + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBloc + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFra + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCPar + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! 
ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). 
+- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. 
Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. 
++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompres + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CSt + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_C + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CS + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. 
+- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_ + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advan + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advan + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNo + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. 
++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. 
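As a concrete illustration of the validity rules listed above (not part of the patch series), here is a minimal, hypothetical producer in C. It emits a single literals-only sequence covering the whole block, which satisfies every condition: the lengths sum to srcSize, the final sequence is allowed matchLength == 0, and in that case its offset is also 0. It uses the ZSTD_sequenceProducer_F signature and ZSTD_SEQUENCE_PRODUCER_ERROR macro introduced just below in this hunk.

/* Illustrative sketch only -- not part of the patch series.
 * A "null" block-level sequence producer: it finds no matches and
 * describes the whole block as literals, which is always a valid parse. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

static size_t nullSeqProducer(void *state,
                              ZSTD_Sequence *outSeqs, size_t outSeqsCapacity,
                              const void *src, size_t srcSize,
                              const void *dict, size_t dictSize,
                              int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;

    if (srcSize == 0) return 0;                 /* empty block: nothing to emit */
    if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;

    outSeqs[0].offset      = 0;                 /* final sequence with matchLength == 0 */
    outSeqs[0].litLength   = (unsigned)srcSize; /* lit + match lengths sum to srcSize */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep         = 0;
    return 1;                                   /* number of sequences written */
}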
++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! 
ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. ++* ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. 
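A hedged usage sketch for the registration call documented above (C, not part of the patch series): it wires the hypothetical nullSeqProducer from the previous note into a CCtx, enables fallback, and compresses through ZSTD_compress2(), one of the "advanced" entry points that actually honour the registered producer.

/* Illustrative sketch only -- not part of the patch series.
 * Assumes the nullSeqProducer() sketch shown in the earlier note. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t compress_with_external_producer(void *dst, size_t dstCap,
                                       const void *src, size_t srcSize)
{
    ZSTD_CCtx *cctx = ZSTD_createCCtx();
    size_t cSize;
    if (cctx == NULL) return (size_t)-1;

    /* Sticky until the next parameter reset; register NULL to clear it. */
    ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, nullSeqProducer);

    /* Optional: fall back to the internal parser if the producer errors out. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);

    /* Only compress2()/compressStream2() see advanced parameters, hence the producer. */
    cSize = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);

    ZSTD_freeCCtx(cctx);
    return cSize;   /* check with ZSTD_isError() */
}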
+- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. 
++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSi + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,149 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(con + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
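A tiny worked example (userspace-style C, illustrative only, not part of the patch series) of what the bits.h helpers added above compute; the values follow directly from the definitions (ctz, 31 - clz, and rotation masked to the word width).

/* Illustrative sketch only -- not part of the patch series.
 * Demonstrates the semantics of the helpers in lib/zstd/common/bits.h. */
#include <assert.h>
#include "bits.h"   /* in-tree header; all helpers are MEM_STATIC */

static void bits_helpers_demo(void)
{
    assert(ZSTD_countTrailingZeros32(8U) == 3);             /* 0b1000 */
    assert(ZSTD_countLeadingZeros32(1U) == 31);
    assert(ZSTD_highbit32(5U) == 2);                        /* 31 - clz(0b101) */
    assert(ZSTD_countTrailingZeros64(1ULL << 40) == 40);
    assert(ZSTD_rotateRight_U32(0x1U, 1) == 0x80000000U);   /* wraps around */
    assert(ZSTD_rotateRight_U16((U16)0x0001, 1) == (U16)0x8000);
}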
+@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CS + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CS + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_ + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_sk + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_D + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reload + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. 
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
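A standalone sketch of the same guarded-attribute idea (macro and function names here are invented; the real definitions are the ones in this hunk). The point is that only the small out-of-line helper carries the no_sanitize annotation, so UBSAN stays fully enabled everywhere else:

#include <stddef.h>
#include <stdio.h>

#ifndef __has_attribute
# define __has_attribute(x) 0           /* fallback for compilers without it */
#endif

#if __has_attribute(no_sanitize)
# define DEMO_ALLOW_PTR_OVERFLOW __attribute__((no_sanitize("pointer-overflow")))
#else
# define DEMO_ALLOW_PTR_OVERFLOW
#endif

/* Wrapped pointer difference, kept out-of-line so only this helper needs the
 * attribute -- the same shape as ZSTD_wrappedPtrDiff() above. */
static DEMO_ALLOW_PTR_OVERFLOW
ptrdiff_t demo_wrapped_diff(const unsigned char *lhs, const unsigned char *rhs)
{
    return lhs - rhs;
}

int main(void)
{
    unsigned char buf[8];
    printf("%td\n", demo_wrapped_diff(buf + 8, buf));   /* prints 8 */
    return 0;
}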
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) 
\ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normal + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normal + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normal + * know that threshold > 1. 
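All the brace-only helper macros touched in these hunks (PREFETCH_AREA, ZSTD_UNREACHABLE, RAWLOG, DEBUGLOG, and further down the error-handling macros in error_private.h) move to do { ... } while (0) for the usual reason: a bare { ... } plus the caller's ';' breaks if/else nesting. A minimal standalone illustration with invented names:

#include <stdio.h>

#define LOG2_BAD(a, b)  { printf("%d\n", (a)); printf("%d\n", (b)); }
#define LOG2_OK(a, b)   do { printf("%d\n", (a)); printf("%d\n", (b)); } while (0)

int main(void)
{
    int verbose = 0;

    /* With LOG2_BAD this would not compile:
     *     if (verbose) LOG2_BAD(1, 2); else printf("quiet\n");
     * the ';' after the closing '}' ends the if statement, leaving the else
     * without a matching if.  The do/while form is a single statement that
     * still demands the trailing ';'. */
    if (verbose)
        LOG2_OK(1, 2);
    else
        printf("quiet\n");
    return 0;
}

The same reasoning is behind dropping the trailing semicolon from the existing while(0); forms of RETURN_ERROR and FORWARD_IF_ERROR further down: the caller's own ';' has to be the one that terminates the statement.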
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, s + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, siz + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeig + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
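The implied-last-weight logic above is easy to check by hand: each nonzero weight w contributes 2^(w-1) to weightTotal, the grand total must land on a clean power of two (2^tableLog), and the final symbol's weight is reconstructed from the remainder. A standalone worked example (invented helper, same arithmetic as the ZSTD_highbit32() calls in HUF_readStats_body):

#include <stdio.h>

static unsigned highbit32(unsigned v)       /* position of the highest set bit */
{
    unsigned n = 0;
    while (v >>= 1) n++;
    return n;
}

int main(void)
{
    unsigned const weightTotal = 48;                          /* sum of 2^(w-1) over decoded weights */
    unsigned const tableLog    = highbit32(weightTotal) + 1;  /* -> 6  */
    unsigned const total       = 1u << tableLog;              /* -> 64 */
    unsigned const rest        = total - weightTotal;         /* -> 16 */
    unsigned const verif       = 1u << highbit32(rest);       /* must equal rest */
    unsigned const lastWeight  = highbit32(rest) + 1;         /* -> 5  */

    if (verif != rest) {                    /* same corruption check as the patch */
        printf("corrupted: remainder %u is not a power of two\n", rest);
        return 1;
    }
    printf("tableLog=%u, implied last weight=%u\n", tableLog, lastWeight);
    return 0;
}

The implied symbol of weight 5 contributes 2^4 = 16, topping weightTotal = 48 up to the required power of two 64.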
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t c + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -89,20 +62,6 @@ FSE_PUBLIC_API const char* FSE_getErrorN + + + /*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- +-/*-***************************************** + * FSE detailed API + ******************************************/ + /*! 
+@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (v + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CSt + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. 
+- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(F + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(F + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(F + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(F + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, 
dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* ds + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
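The dtable relocation above -- computing dtablePos instead of keeping a flexible array member inside FSE_DecompressWksp -- is a general workspace-carving trick; a standalone sketch with invented names:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t table_entry;               /* stand-in for FSE_DTable         */

struct wksp_header {                        /* stand-in for FSE_DecompressWksp */
    int16_t ncount[16];
};

int main(void)
{
    uint64_t workspace[64];                 /* flat, suitably aligned scratch buffer */
    unsigned char *const base = (unsigned char *)workspace;
    struct wksp_header *const hdr = (struct wksp_header *)base;

    /* Offset of the variable-sized table, counted in table entries -- the same
     * arithmetic as dtablePos in the patched FSE_decompress_wksp_body(). */
    size_t const table_pos = sizeof(struct wksp_header) / sizeof(table_entry);
    table_entry *const table = (table_entry *)base + table_pos;

    /* The carving is only valid if the header occupies a whole number of table
     * entries; the patch pins that down with FSE_STATIC_ASSERT. */
    assert(sizeof(struct wksp_header) % sizeof(table_entry) == 0);

    hdr->ncount[0] = 1;
    table[0] = 42;
    printf("table starts %zu bytes into the workspace\n",
           (size_t)((unsigned char *)table - base));
    return 0;
}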
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
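The HUF_flags_e bitset above replaces the separate bmi2 / preferRepeat / suspectUncompressible arguments that the rest of the patch removes from the HUF entry points. A standalone sketch of the calling convention -- the demo enum only mirrors the values above so the snippet compiles on its own; real callers take them from huf.h:

#include <stdio.h>

typedef enum {
    demo_flags_bmi2                  = (1 << 0),
    demo_flags_optimalDepth          = (1 << 1),
    demo_flags_preferRepeat          = (1 << 2),
    demo_flags_suspectUncompressible = (1 << 3),
    demo_flags_disableAsm            = (1 << 4),
    demo_flags_disableFast           = (1 << 5)
} demo_flags_e;

static const char *pick_decoder_loop(int flags)
{
    /* a callee tests one bit, just like `flags & HUF_flags_bmi2` in the
     * patched HUF_readStats_wksp() */
    return (flags & demo_flags_disableFast) ? "fallback loop" : "fast loop";
}

int main(void)
{
    int flags = 0;
    flags |= demo_flags_bmi2;           /* e.g. the CPU reports BMI2 support */
    flags |= demo_flags_disableAsm;     /* e.g. a build without assembly     */

    printf("bmi2 requested: %s\n", (flags & demo_flags_bmi2) ? "yes" : "no");
    printf("decoder: %s\n", pick_decoder_loop(flags));
    return 0;
}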
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeig + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, s + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* 
dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. 
+ * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t divi + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. 
++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compre + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_c + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned max + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? 
use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, s + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, s + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, s + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, si + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t s + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normal + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, s + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. 
Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. 
+ */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huf + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huf + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huf + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). 
+ */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huf + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNo + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNo + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_ + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(H + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStre + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_defa + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ 
HUF_compress4X_usingCTable_internal(void + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fou + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,61 +1428,35 @@ 
HUF_compress_internal (void* dst, size_t + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CC + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEn + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxPar + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_pa + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_interna + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZST + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. ++ * @param params Validated zstd parameters. 
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZS + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case 
ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return 
CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxP + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ 
return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_rev + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CC + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CC + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ 
cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressio + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compres + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compres + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParam + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCPara + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compre + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compre + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 
3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usin + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usin + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usin + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1637,6 +1879,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); ++ 
ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZS + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZS + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? 
*/ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZS + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. 
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCD + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZST + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_match + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* s + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(se + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = 
seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(se + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(se + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(se + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCom + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ 
ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCom + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPt + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. 
*/ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; +- +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); ++ ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= 
ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, s + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t cons + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepco + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, s + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStat + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStat + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStat + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_ + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStat + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqSt + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_lit + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_sym + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, 
wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchByt + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct 
position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seq + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seq + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) + { +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCode + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. 
+ * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSpl + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). 
+ */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZST + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(Z + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(Z + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(Z + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetC + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetC + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded + * All blocks will be terminated, all input will be consumed. 
+ * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(Z + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(Z + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_inte + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t 
ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZS + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZS + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_comp + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_comp + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_comp + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_interna + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_ + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, 
cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, si + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_adv + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_i + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zc + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generi + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generi + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generi + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generi + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generi + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generi + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generi + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability( + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. 
*/ +@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStr + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStr + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStr + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ 
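/*
 * Illustrative sketch only, not part of the patch above: a minimal caller of
 * the streaming entry point whose initialization path is modified in this
 * hunk. It is written against the upstream userspace zstd.h API
 * (ZSTD_createCCtx, ZSTD_CCtx_setParameter, ZSTD_compressStream2); the
 * in-kernel build exposes zstd_* wrappers instead, so every name below is an
 * assumption for illustration, not part of this series.
 */
#include <stddef.h>
#include <zstd.h>

static size_t sketch_stream_compress(const void* src, size_t srcSize,
                                     void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_inBuffer  in  = { src, srcSize, 0 };
    ZSTD_outBuffer out = { dst, dstCapacity, 0 };
    size_t ret = 0;

    if (cctx == NULL) return 0;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);

    /* ZSTD_e_end goes through the transparent initialization stage shown
     * above, pledging (in.size - in.pos) as the source size. A non-zero,
     * non-error return means more output space is still needed, so
     * dstCapacity should be at least ZSTD_compressBound(srcSize) here. */
    do {
        ret = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
    } while (ret != 0 && !ZSTD_isError(ret) && out.pos < out.size);

    ZSTD_freeCCtx(cctx);
    if (ZSTD_isError(ret) || ret != 0) return 0;  /* error, or dst too small */
    return out.pos;
}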
++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, 
cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSe + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCt + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCt + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCt + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCt + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
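/*
 * Illustrative sketch only, not part of the patch: round-tripping the external
 * sequence interface consumed by the block loop above. It relies on the
 * upstream experimental userspace API (ZSTD_STATIC_LINKING_ONLY section of
 * zstd.h); ZSTD_generateSequences and ZSTD_sequenceBound are assumptions taken
 * from that header and are not defined anywhere in this patch.
 */
#define ZSTD_STATIC_LINKING_ONLY
#include <stdlib.h>
#include <zstd.h>

static size_t sketch_compress_with_sequences(const void* src, size_t srcSize,
                                             void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_Sequence* const seqs =
        malloc(ZSTD_sequenceBound(srcSize) * sizeof(ZSTD_Sequence));
    size_t cSize = 0;

    if (cctx && seqs) {
        /* 1) let zstd emit a sequence stream with explicit block delimiters */
        size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs,
                                                     ZSTD_sequenceBound(srcSize),
                                                     src, srcSize);
        if (!ZSTD_isError(nbSeqs)) {
            /* 2) feed the sequences back through the path implemented above */
            ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                   ZSTD_sf_explicitBlockDelimiters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);
            {   size_t const r = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                        seqs, nbSeqs, src, srcSize);
                if (!ZSTD_isError(r)) cSize = r;
            }
        }
    }
    free(seqs);
    ZSTD_freeCCtx(cctx);
    return cSize;  /* 0 signals failure in this sketch */
}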
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCt + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCt + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCt + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_g + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compr + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
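/*
 * Illustrative sketch only, not part of the patch: wiring up the external
 * sequence producer hook added by ZSTD_registerSequenceProducer() above. The
 * callback type ZSTD_sequenceProducer_F comes from the experimental section of
 * the upstream zstd.h; its parameter list is not reproduced here, so the
 * producer itself is a caller-supplied assumption.
 */
#define ZSTD_STATIC_LINKING_ONLY
#include <stddef.h>
#include <zstd.h>

static void sketch_use_sequence_producer(ZSTD_CCtx* cctx,
                                         void* producerState,
                                         ZSTD_sequenceProducer_F producer)
{
    /* install the producer; zstd consults it while compressing blocks */
    ZSTD_registerSequenceProducer(cctx, producerState, producer);

    /* optionally keep the internal matchfinder as a fallback if the producer
     * fails (field enableMatchFinderFallback below; the public parameter name
     * is assumed from upstream zstd.h) */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);

    /* ... run ZSTD_compressStream2() / ZSTD_compress2() as usual ... */

    /* passing NULL/NULL clears the registration, per the NULL branch above */
    ZSTD_registerSequenceProducer(cctx, NULL, NULL);
}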
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds( + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock ( + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t sr + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, con + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return 
ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 h + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowC + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_wi + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U3 + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. 
++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_stra + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* ds + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (vo + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCT + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. */ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCT + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(siz + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. 
++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_lite + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_lite + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_lite + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_lite + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequ + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; 
++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequ + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequ + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(cons + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(cons + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_ + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize( + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTable + 
return 0; + } + ++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
*/ + static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, +@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_mult + { + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_mult + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; + +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); + +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); +- } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, 
++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ ++ } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... 
] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_intern + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_all + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,11 +244,19 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_al + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + + /* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ ++/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,6 +358,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buff + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ + MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + { +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_alig + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. 
+ */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables( + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cw + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwk + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwk + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. 
See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_match + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noD + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noD + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noD + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noD + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ _match_found: /* requires ip, offset, mL + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ _match_stored: + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -254,6 +313,7 @@ _match_stored: + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dic + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dic + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = 
ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dic + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ _search_next_long: + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ _match_found: + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ _match_stored: + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ _match_stored: + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleF + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleF + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleF + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleF + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_ext + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_ext + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! 
*/ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ _start: /* Requires: ip0 */ + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). 
*/ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ _start: /* Requires: ip0 */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ _start: /* Requires: ip0 */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ _cleanup: + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ _offset: /* Requires: ip0, idx */ + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ _match: /* Requires: ip0, match0, offcod + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. 
*/ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ _match: /* Requires: ip0, match0, offcod + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatch + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatch + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatch + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. 
*/ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = 
repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatch + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_ex + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_ex + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_ex + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = 
ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies + * Starting from the 
LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_pref + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fill + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextC + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? 
ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_upda + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_search + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ _storeSequence: + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ _storeSequence: + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_g + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_g + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, 
searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ _storeSequence: + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ _storeSequence: + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZS + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZS + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -12,11 +13,14 @@
+ #include "hist.h"
+ #include "zstd_opt.h"
+ 
++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+ 
+ #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+ #define ZSTD_MAX_PRICE (1<<30)
+ 
+-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+ 
+ 
+ /*-*************************************
+@@ -26,27 +30,35 @@
+ #if 0 /* approximation at bit level (for tests) */
+ # define BITCOST_ACCURACY 0
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
+ #elif 0 /* fractional bit accuracy (for tests) */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
+ #else /* opt==approx, ultra==accurate */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+ #endif
+ 
++/* ZSTD_bitWeight() :
++ * provide estimated "cost" of a stat in full bits only */
+ MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+ {
+ return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+ }
+ 
++/* ZSTD_fracWeight() :
++ * provide fractional-bit "cost" of a stat,
++ * using linear interpolation approximation */
+ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+ {
+ U32 const stat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(stat);
+ U32 const BWeight = hb * BITCOST_MULTIPLIER;
++ /* Fweight was meant for "Fractional weight"
++ * but it's effectively a value between 1 and 2
++ * using fixed point arithmetic */
+ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + BITCOST_ACCURACY < 31);
+@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawSt
+ /* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+-MEM_STATIC double ZSTD_fCost(U32 price)
++MEM_STATIC double ZSTD_fCost(int price)
+ {
+ return (double)price / (BITCOST_MULTIPLIER*8);
+ }
+@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[
+ return total;
+ }
+ 
+-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
++
++static U32
++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
+ {
+ U32 s, sum=0;
+- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
++ (unsigned)lastEltIndex+1, (unsigned)shift );
+ assert(shift < 30);
+ for (s=0; s<lastEltIndex+1; s++) {
+- table[s] = 1 + (table[s] >> shift);
+- sum += table[s];
++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* tab + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optP + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optP + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optP + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BY + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BY + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const 
litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = 
ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const v + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexH + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + } + + if (*nbMatches == 0 || ((candidateMatchLength > 
matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_o + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltI + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ _shortestPath: /* cur, last_pos, best_ + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ _shortestPath: /* cur, last_pos, best_ + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatc + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDic + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDic + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(cons + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = 
lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. 
+- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(B + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstrea + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +- +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) ++ ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_ + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_b + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_b + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_b + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_b + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_int + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void 
HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} + +-static HUF_ASM_X86_64_BMI2_ATTRS ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_b + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static 
size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_D + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DT + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DSt + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +- +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) ++ ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_b + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_b + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_b + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_b + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_int + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, 
DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_b + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, 
cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, s + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DT + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZS + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDic + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. 
+- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZS + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_ + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. 
++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,49 +570,52 @@ static size_t readSkippableFrameSize(voi + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); +- +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); ++ ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +625,7 @@ unsigned long long ZSTD_findDecompressed + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressed + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = 
ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFram + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrame + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrame + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound( + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound( + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = 
ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. */ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_ + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_ + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_ + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame( + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. 
+ * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* d + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const v + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const v + * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_D + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zd + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_ + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_d + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DS + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, 
ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsig + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* sr + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize 
>= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, cons + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BY + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffe + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseSta + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseSta + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(Z + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuf + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuf + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they will need to read literals from 
litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuf + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuf + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == 
ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuf + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- 
BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 
++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or 
second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), 
dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequence + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + 
ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ * compared to maximum possible of (1< 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. 
++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. 
++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); +- +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); ++ ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dct + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. 
*/ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[Max + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + diff --git a/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch new file mode 100644 index 0000000..3b837a1 --- /dev/null +++ b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch @@ -0,0 +1,58 @@ +From c09f361b41027ca073de5631c66dfe0e7275c3a4 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Mon, 22 Jan 2024 16:27:56 -0800 +Subject: lib: zstd: Refactor intentional wrap-around test + +In an effort to separate intentional arithmetic wrap-around from +unexpected wrap-around, we need to refactor places that depend on this +kind of math. One of the most common code patterns of this is: + + VAR + value < VAR + +Notably, this is considered "undefined behavior" for signed and pointer +types, which the kernel works around by using the -fno-strict-overflow +option in the build[1] (which used to just be -fwrapv). Regardless, we +want to get the kernel source to the position where we can meaningfully +instrument arithmetic wrap-around conditions and catch them when they +are unexpected, regardless of whether they are signed[2], unsigned[3], +or pointer[4] types. + +Switch to a more regular type for a 64-bit value and refactor the +open-coded wrap-around addition test to use subtraction from the type max +(since add_would_overflow() may not be defined in early boot code). This +paves the way to enabling the wrap-around sanitizers in the future. 
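+
+A minimal sketch of the same pattern in isolation (the accumulate() helper is
+hypothetical and not part of this patch):
+
+	#include <linux/types.h>
+	#include <linux/limits.h>
+	#include <linux/errno.h>
+
+	/* Reject the addition up front instead of detecting wrap-around after it. */
+	static int accumulate(u64 *total, u64 add)
+	{
+		/* old form: if (*total + add < *total) -- relies on unsigned wrap-around */
+		if (U64_MAX - *total < add)
+			return -ERANGE;	/* would overflow */
+		*total += add;
+		return 0;
+	}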
+ +Link: https://git.kernel.org/linus/68df3755e383e6fecf2354a67b08f92f18536594 [1] +Link: https://github.com/KSPP/linux/issues/26 [2] +Link: https://github.com/KSPP/linux/issues/27 [3] +Link: https://github.com/KSPP/linux/issues/344 [4] +Cc: Nick Terrell +Cc: Paul Jones +Cc: Sedat Dilek +Cc: Oleksandr Natalenko +Cc: Xin Gao +Signed-off-by: Kees Cook +--- + lib/zstd/decompress/zstd_decompress.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -618,7 +618,7 @@ size_t ZSTD_readSkippableFrame(void* dst + * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); +@@ -636,7 +636,7 @@ unsigned long long ZSTD_findDecompressed + { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); + if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- if (totalDstSize + fcs < totalDstSize) ++ if (U64_MAX - totalDstSize < fcs) + return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ + totalDstSize += fcs; + } diff --git a/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch new file mode 100644 index 0000000..aa448d6 --- /dev/null +++ b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch @@ -0,0 +1,209 @@ +From 0bbb6450b1ba362c1c2e7d8d752b39ec9844629b Mon Sep 17 00:00:00 2001 +From: Christian Brauner +Date: Wed, 16 Jan 2019 23:13:25 +0100 +Subject: [PATCH 1/4] binder: turn into module + +The Android binder driver needs to become a module for the sake of shipping +Anbox. To do this we need to export the following functions since binder is +currently still using them: + +- security_binder_set_context_mgr() +- security_binder_transaction() +- security_binder_transfer_binder() +- security_binder_transfer_file() +- can_nice() +- __close_fd_get_file() +- mmput_async() +- task_work_add() +- map_kernel_range_noflush() +- get_vm_area() +- zap_page_range_single() +- put_ipc_ns() +- get_ipc_ns_exported() +- show_init_ipc_ns() + +Signed-off-by: Christian Brauner +[ saf: fix additional reference to init_ipc_ns from 5.0-rc6 ] +Signed-off-by: Seth Forshee +[ arighi: fix EXPORT_SYMBOL vs EXPORT_SYMBOL_GPL change from 6.0-rc5 ] +[ arighi: zap_page_range() has been dropped, export zap_page_range_single() in 6.3 ] +Signed-off-by: Andrea Righi +Signed-off-by: Alexandre Frade +--- + drivers/android/Kconfig | 6 +++--- + drivers/android/binder.c | 17 ++++++++++++++--- + drivers/android/binder_alloc.h | 3 ++- + drivers/android/binder_internal.h | 5 +++-- + drivers/android/binderfs.c | 6 +++--- + fs/file.c | 1 + + include/linux/ipc_namespace.h | 3 +++ + ipc/namespace.c | 17 +++++++++++++++++ + kernel/sched/syscalls.c | 1 + + kernel/task_work.c | 1 + + mm/memory.c | 1 + + mm/vmalloc.c | 1 + + security/security.c | 4 ++++ + 14 files changed, 61 insertions(+), 15 deletions(-) + +--- a/drivers/android/Kconfig ++++ b/drivers/android/Kconfig +@@ -14,8 +14,8 @@ config ANDROID_BINDER_IPC + between said processes. 
+ + config ANDROID_BINDERFS +- bool "Android Binderfs filesystem" +- depends on ANDROID_BINDER_IPC ++ tristate "Android Binderfs filesystem" ++ depends on (ANDROID_BINDER_IPC=y) || (ANDROID_BINDER_IPC=m && m) + default n + help + Binderfs is a pseudo-filesystem for the Android Binder IPC driver +--- a/drivers/android/binder.c ++++ b/drivers/android/binder.c +@@ -6713,9 +6713,20 @@ err_alloc_device_names_failed: + return ret; + } + +-device_initcall(binder_init); ++module_init(binder_init); ++/* ++ * binder will have no exit function since binderfs instances can be mounted ++ * multiple times and also in user namespaces finding and destroying them all ++ * is not feasible without introducing insane locking. Just ignoring existing ++ * instances on module unload also wouldn't work since we would loose track of ++ * what major numer was dynamically allocated and also what minor numbers are ++ * already given out. So this would get us into all kinds of issues with device ++ * number reuse. So simply don't allow unloading unless we are forced to do so. ++ */ ++ ++MODULE_AUTHOR("Google, Inc."); ++MODULE_DESCRIPTION("Driver for Android binder device"); ++MODULE_LICENSE("GPL v2"); + + #define CREATE_TRACE_POINTS + #include "binder_trace.h" +- +-MODULE_LICENSE("GPL v2"); +--- a/drivers/android/binder_alloc.h ++++ b/drivers/android/binder_alloc.h +@@ -6,6 +6,7 @@ + #ifndef _LINUX_BINDER_ALLOC_H + #define _LINUX_BINDER_ALLOC_H + ++#include + #include + #include + #include +@@ -111,7 +112,7 @@ struct binder_alloc { + bool oneway_spam_detected; + }; + +-#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST ++#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_SELFTEST) + void binder_selftest_alloc(struct binder_alloc *alloc); + #else + static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} +--- a/drivers/android/binder_internal.h ++++ b/drivers/android/binder_internal.h +@@ -5,6 +5,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ extern const struct file_operations bind + + extern char *binder_devices_param; + +-#ifdef CONFIG_ANDROID_BINDERFS ++#if IS_ENABLED(CONFIG_ANDROID_BINDERFS) + extern bool is_binderfs_device(const struct inode *inode); + extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, + const struct file_operations *fops, +@@ -99,7 +100,7 @@ static inline struct dentry *binderfs_cr + static inline void binderfs_remove_file(struct dentry *dentry) {} + #endif + +-#ifdef CONFIG_ANDROID_BINDERFS ++#if IS_ENABLED(CONFIG_ANDROID_BINDERFS) + extern int __init init_binderfs(void); + #else + static inline int __init init_binderfs(void) +--- a/drivers/android/binderfs.c ++++ b/drivers/android/binderfs.c +@@ -120,7 +120,7 @@ static int binderfs_binder_device_create + struct super_block *sb = ref_inode->i_sb; + struct binderfs_info *info = sb->s_fs_info; + #if defined(CONFIG_IPC_NS) +- bool use_reserve = (info->ipc_ns == &init_ipc_ns); ++ bool use_reserve = (info->ipc_ns == show_init_ipc_ns()); + #else + bool use_reserve = true; + #endif +@@ -397,7 +397,7 @@ static int binderfs_binder_ctl_create(st + struct dentry *root = sb->s_root; + struct binderfs_info *info = sb->s_fs_info; + #if defined(CONFIG_IPC_NS) +- bool use_reserve = (info->ipc_ns == &init_ipc_ns); ++ bool use_reserve = (info->ipc_ns == show_init_ipc_ns()); + #else + bool use_reserve = true; + #endif +@@ -683,7 +683,7 @@ static int binderfs_fill_super(struct su + return -ENOMEM; + info = sb->s_fs_info; + +- info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); ++ info->ipc_ns 
= get_ipc_ns_exported(current->nsproxy->ipc_ns); + + info->root_gid = make_kgid(sb->s_user_ns, 0); + if (!gid_valid(info->root_gid)) +--- a/include/linux/ipc_namespace.h ++++ b/include/linux/ipc_namespace.h +@@ -128,6 +128,9 @@ extern int mq_init_ns(struct ipc_namespa + static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } + #endif + ++extern struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns); ++extern struct ipc_namespace *show_init_ipc_ns(void); ++ + #if defined(CONFIG_IPC_NS) + extern struct ipc_namespace *copy_ipcs(unsigned long flags, + struct user_namespace *user_ns, struct ipc_namespace *ns); +--- a/ipc/namespace.c ++++ b/ipc/namespace.c +@@ -207,6 +207,22 @@ void put_ipc_ns(struct ipc_namespace *ns + } + EXPORT_SYMBOL_GPL(put_ipc_ns); + ++struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns) ++{ ++ return get_ipc_ns(ns); ++} ++EXPORT_SYMBOL(get_ipc_ns_exported); ++ ++struct ipc_namespace *show_init_ipc_ns(void) ++{ ++#if defined(CONFIG_IPC_NS) ++ return &init_ipc_ns; ++#else ++ return NULL; ++#endif ++} ++EXPORT_SYMBOL(show_init_ipc_ns); ++ + static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) + { + return container_of(ns, struct ipc_namespace, ns); +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3166,6 +3166,7 @@ struct vm_struct *get_vm_area(unsigned l + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + } ++EXPORT_SYMBOL(get_vm_area); + + struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + const void *caller) diff --git a/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch new file mode 100644 index 0000000..e9f5aca --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch @@ -0,0 +1,82 @@ +From 1e2892b83e6c4062623f9aec06eba27884d47059 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 13 Dec 2018 01:00:49 +0000 +Subject: [PATCH 1/4] sched/wait: Do accept() in LIFO order for cache + efficiency + +Signed-off-by: Alexandre Frade +--- + include/linux/wait.h | 2 ++ + kernel/sched/wait.c | 24 ++++++++++++++++++++++++ + net/ipv4/inet_connection_sock.c | 2 +- + 3 files changed, 27 insertions(+), 1 deletion(-) + +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -163,6 +163,7 @@ static inline bool wq_has_sleeper(struct + + extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); ++extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + +@@ -1191,6 +1192,7 @@ do { \ + */ + void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + void finish_wait(struct 
wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +--- a/kernel/sched/wait.c ++++ b/kernel/sched/wait.c +@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait + } + EXPORT_SYMBOL_GPL(add_wait_queue_priority); + ++void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ __add_wait_queue(wq_head, wq_entry); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); ++ + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + unsigned long flags; +@@ -259,6 +270,19 @@ prepare_to_wait_exclusive(struct wait_qu + } + EXPORT_SYMBOL(prepare_to_wait_exclusive); + ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ if (list_empty(&wq_entry->entry)) ++ __add_wait_queue(wq_head, wq_entry); ++ set_current_state(state); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); ++ + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) + { + wq_entry->flags = flags; +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(str + * having to remove and re-insert us on the wait queue. + */ + for (;;) { +- prepare_to_wait_exclusive(sk_sleep(sk), &wait, ++ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) diff --git a/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch new file mode 100644 index 0000000..b9342a8 --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch @@ -0,0 +1,25 @@ +From 52caaf4af0242cc3ac1ddaf9791c4c08a4b7226d Mon Sep 17 00:00:00 2001 +From: William Douglas +Date: Wed, 20 Jun 2018 17:23:21 +0000 +Subject: [PATCH 2/4] firmware: Enable stateless firmware loading + +Prefer the order of specific version before generic and /etc before +/lib to enable the user to give specific overrides for generic +firmware and distribution firmware. 
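+
+As an illustration (the firmware name below is hypothetical), a request for
+"vendor/device.bin" is now tried in this order before the remaining built-in
+locations:
+
+	<firmware_class.path= override, if set>
+	/etc/firmware/<UTS_RELEASE>/vendor/device.bin
+	/etc/firmware/vendor/device.bin
+	/lib/firmware/updates/<UTS_RELEASE>/vendor/device.bin
+	/lib/firmware/updates/vendor/device.bin
+	/lib/firmware/<UTS_RELEASE>/vendor/device.bin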
+ +Signed-off-by: Alexandre Frade +--- + drivers/base/firmware_loader/main.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/base/firmware_loader/main.c ++++ b/drivers/base/firmware_loader/main.c +@@ -471,6 +471,8 @@ static int fw_decompress_xz(struct devic + static char fw_path_para[256]; + static const char * const fw_path[] = { + fw_path_para, ++ "/etc/firmware/" UTS_RELEASE, ++ "/etc/firmware", + "/lib/firmware/updates/" UTS_RELEASE, + "/lib/firmware/updates", + "/lib/firmware/" UTS_RELEASE, diff --git a/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch new file mode 100644 index 0000000..79cbb57 --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch @@ -0,0 +1,32 @@ +From f57e26428b29ddcdf58b35659002293bde1bf281 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH 3/4] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit + +Signed-off-by: Alexandre Frade +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -749,6 +749,7 @@ rwsem_spin_on_owner(struct rw_semaphore + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -785,7 +786,8 @@ rwsem_spin_on_owner(struct rw_semaphore + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; diff --git a/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch new file mode 100644 index 0000000..a6c2e65 --- /dev/null +++ b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch @@ -0,0 +1,2288 @@ +From f68a84ded01d688aa58a973788f507899fbe191f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 27 Feb 2023 01:38:18 +0000 +Subject: [PATCH 1/2] netfilter: Add netfilter nf_tables fullcone support + +Signed-off-by: Syrone Wong +Signed-off-by: Alexandre Frade +--- + include/net/netfilter/nf_nat_fullcone.h | 156 +++ + net/netfilter/Kconfig | 15 + + net/netfilter/Makefile | 5 + + net/netfilter/nf_nat_fullcone.c | 1604 +++++++++++++++++++++++ + net/netfilter/nft_ext_fullcone.c | 466 +++++++ + 5 files changed, 2246 insertions(+) + create mode 100644 include/net/netfilter/nf_nat_fullcone.h + create mode 100644 net/netfilter/nf_nat_fullcone.c + create mode 100644 net/netfilter/nft_ext_fullcone.c + +--- /dev/null ++++ b/include/net/netfilter/nf_nat_fullcone.h +@@ -0,0 +1,156 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support library header ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ */ ++#ifndef _NF_NAT_FULLCONE_H_ ++#define _NF_NAT_FULLCONE_H_ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include 
++#include ++ ++#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY ++#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ++static inline int nf_ct_netns_get(struct net *net, u8 nfproto) ++{ ++ return 0; ++} ++ ++static inline void nf_ct_netns_put(struct net *net, u8 nfproto) ++{ ++} ++#endif ++ ++/** ++ * enum nft_fullcone_attributes - nf_tables fullcone expression netlink attributes ++ * ++ * @NFTA_FULLCONE_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) ++ * @NFTA_FULLCONE_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) ++ * @NFTA_FULLCONE_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) ++ */ ++enum nft_fullcone_attributes { ++ NFTA_FULLCONE_UNSPEC, ++ NFTA_FULLCONE_REG_PROTO_MIN, ++ NFTA_FULLCONE_REG_PROTO_MAX, ++ NFTA_FULLCONE_FLAGS, ++ __NFTA_FULLCONE_MAX ++}; ++#define NFTA_FULLCONE_MAX (__NFTA_FULLCONE_MAX - 1) ++ ++/* fullcone specific data structures */ ++ ++struct nat_mapping_original_tuple { ++ struct nf_conntrack_tuple tuple; ++ ++ struct list_head node; ++}; ++ ++struct nat_mapping { ++ uint16_t port; /* external source port */ ++ __be32 addr; /* external source ip address */ ++ ++ __be32 int_addr; /* internal source ip address */ ++ uint16_t int_port; /* internal source port */ ++ ++ int refer_count; /* how many references linked to this mapping ++ * aka. length of original_tuple_list */ ++ ++ struct list_head original_tuple_list; ++ ++ struct hlist_node node_by_ext_port; ++ struct hlist_node node_by_int_src; ++ ++}; ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++struct nat_mapping6 { ++ uint16_t port; /* external source port */ ++ union nf_inet_addr addr; /* external source ip address */ ++ ++ union nf_inet_addr int_addr; /* internal source ip address */ ++ uint16_t int_port; /* internal source port */ ++ ++ int refer_count; /* how many references linked to this mapping ++ * aka. 
length of original_tuple_list */ ++ ++ struct list_head original_tuple_list; ++ ++ struct hlist_node node_by_ext_port; ++ struct hlist_node node_by_int_src; ++ ++}; ++#endif ++ ++struct tuple_list { ++ struct nf_conntrack_tuple tuple_original; ++ struct nf_conntrack_tuple tuple_reply; ++ struct list_head list; ++}; ++ ++/* fullcone specific data structures end */ ++ ++// NOTE: declaration listed here must use EXPORT_SYMBOL_* ++ ++unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out); ++ ++unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out); ++ ++void nf_nat_fullcone_handle_dying_tuples(void); ++void nf_nat_fullcone_destroy_mappings(void); ++void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying); ++ ++/* ++ * For [FUTURE] usage ++ * ++ * from https://elixir.bootlin.com/linux/v5.15.32/source/net/netfilter/xt_nat.c#L37 ++static void xt_nat_convert_range(struct nf_nat_range2 *dst, ++ const struct nf_nat_ipv4_range *src) ++{ ++ memset(&dst->min_addr, 0, sizeof(dst->min_addr)); ++ memset(&dst->max_addr, 0, sizeof(dst->max_addr)); ++ // base_proto is nf_nat_range2 specific ++ memset(&dst->base_proto, 0, sizeof(dst->base_proto)); ++ ++ dst->flags = src->flags; ++ dst->min_addr.ip = src->min_ip; ++ dst->max_addr.ip = src->max_ip; ++ dst->min_proto = src->min; ++ dst->max_proto = src->max; ++} ++ * ++ */ ++ ++#endif /*_NF_NAT_FULLCONE_H_ */ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -571,6 +571,21 @@ config NFT_NAT + This option adds the "nat" expression that you can use to perform + typical Network Address Translation (NAT) packet transformations. + ++config NFT_FULLCONE ++ depends on NF_CONNTRACK ++ depends on NF_NAT ++ tristate "Netfilter nf_tables fullcone support" ++ help ++ This options adds the "fullcone" expression that you can use ++ to perform NAT in the RFC3489-compatible full cone SNAT flavour. ++ Currently only UDP traffic is supported for full-cone NAT. ++ For other protos FULLCONENAT is equivalent to MASQUERADE. ++ ++ To compile this code as a module, choose M here: the module will be ++ called nft_fullcone. ++ ++ If unsure, say N. 
++ + config NFT_TUNNEL + tristate "Netfilter nf_tables tunnel module" + help +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -240,3 +240,8 @@ obj-$(CONFIG_IP_VS) += ipvs/ + + # lwtunnel + obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o ++ ++# Nftables/netfilter fullcone expression support ++obj-$(CONFIG_NFT_FULLCONE) += \ ++ nf_nat_fullcone.o \ ++ nft_ext_fullcone.o +--- /dev/null ++++ b/net/netfilter/nf_nat_fullcone.c +@@ -0,0 +1,1604 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support library ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ * Copyright (c) 2022 Alexandre Frade ++ * Ported to latest kernel source tree ++ */ ++ ++#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif ++ ++#include ++#include ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#include ++#include ++#include ++#endif ++ ++#include ++ ++/* ++ * FULLCONE_HKEY generates u32 hash value ++ * Modified from net/netfilter/ipset/ip_set_hash_gen.h ++ * dataptr: a pointer ++ * datatypelen: sizeof(struct blah) or sizeof(u32) ++ * initval: initial value ++ * htable_bits: hashtable bits ++ */ ++#define FULLCONE_HKEY(dataptr, datatypelen, initval, htable_bits) \ ++({ \ ++ const u32 *__k = (const u32 *)(dataptr); \ ++ u32 __l = (datatypelen) / sizeof(u32); \ ++ \ ++ BUILD_BUG_ON((datatypelen) % sizeof(u32) != 0); \ ++ \ ++ jhash2(__k, __l, (initval)) & jhash_mask((htable_bits)); \ ++}) ++ ++#define HASHTABLE_BUCKET_BITS 10 ++ ++/* static variables */ ++ ++static DEFINE_HASHTABLE(mapping_table_by_ext_port, HASHTABLE_BUCKET_BITS); ++static DEFINE_HASHTABLE(mapping_table_by_int_src, HASHTABLE_BUCKET_BITS); ++ ++static DEFINE_SPINLOCK(fullconenat_lock); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static DEFINE_HASHTABLE(mapping6_table_by_ext_port, HASHTABLE_BUCKET_BITS); ++static DEFINE_HASHTABLE(mapping6_table_by_int_src, HASHTABLE_BUCKET_BITS); ++ ++static DEFINE_SPINLOCK(fullconenat6_lock); ++#endif ++ ++static LIST_HEAD(dying_tuple_list); ++static DEFINE_SPINLOCK(dying_tuple_list_lock); ++ ++/* static variables end */ ++ ++/* forward declaration */ ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, ++ const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++/* non-atomic: can only be called serially within lock zones. */ ++static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple ++ *t); ++#endif ++/* non-atomic: can only be called serially within lock zones. 
*/ ++static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr ++ *int_addr, ++ const uint16_t int_port, ++ const uint16_t port, const union nf_inet_addr *addr); ++#endif ++static struct nat_mapping *allocate_mapping(const __be32 int_addr, ++ const uint16_t int_port, const uint16_t port, const __be32 addr); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple ++ *original_tuple); ++#endif ++static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple ++ *original_tuple); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr ++ *src_ip, const uint16_t src_port, const union nf_inet_addr ++ *ext_ip); ++#endif ++ ++static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union ++ nf_inet_addr ++ *src_ip, const uint16_t src_port, const union ++ nf_inet_addr ++ *min_ip, const union ++ nf_inet_addr ++ *max_ip); ++#endif ++static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, ++ const uint16_t ++ src_port, const __be32 min_ip, const __be32 max_ip); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void kill_mapping6(struct nat_mapping6 *mapping); ++#endif ++static void kill_mapping(struct nat_mapping *mapping); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping6() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone); ++#else ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone); ++#endif ++ ++#endif ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping() returns 0. 
*/ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone); ++#else ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone); ++#else ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const u16 zone); ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone); ++#else ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, ++ const __be32 ext_ip, struct net *net, const u16 zone); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port6(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++ ++#else ++ struct nf_nat_range *range); ++ ++#endif ++ ++#else ++static uint16_t find_appropriate_port6(struct net *net, const u16 zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++ ++#else ++ struct nf_nat_range *range); ++ ++#endif ++#endif ++ ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++ ++#else ++static uint16_t find_appropriate_port(struct net *net, const u16 zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static void find_leastused_ip6(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); ++#else ++static void find_leastused_ip6(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, 
++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst); ++#else ++static __be32 find_leastused_ip(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst); ++#endif ++ ++/* forward declaration end */ ++ ++/* non-atomic part */ ++ ++static char tuple_tmp_string[512]; ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++/* non-atomic: can only be called serially within lock zones. */ ++static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple *t) ++{ ++ snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), ++ "[%pI6c]:%hu -> [%pI6c]:%hu", &t->src.u3.ip6, ++ be16_to_cpu(t->src.u.all), &t->dst.u3.ip6, be16_to_cpu(t->dst.u.all)); ++ return tuple_tmp_string; ++} ++#endif ++/* non-atomic: can only be called serially within lock zones. */ ++static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t) ++{ ++ snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), ++ "%pI4:%hu -> %pI4:%hu", &t->src.u3.ip, ++ be16_to_cpu(t->src.u.all), &t->dst.u3.ip, be16_to_cpu(t->dst.u.all)); ++ return tuple_tmp_string; ++} ++ ++/* non-atomic part end */ ++ ++void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying) ++{ ++ spin_lock_bh(&dying_tuple_list_lock); ++ list_add(new_dying, &dying_tuple_list); ++ spin_unlock_bh(&dying_tuple_list_lock); ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_dying_tuple_list_add); ++ ++void nf_nat_fullcone_handle_dying_tuples(void) ++{ ++ struct list_head *iter, *tmp, *iter_2, *tmp_2; ++ struct tuple_list *item; ++ struct nf_conntrack_tuple *ct_tuple; ++ struct nat_mapping *mapping; ++ __be32 ip, ext_ip; ++ uint16_t port; ++ struct nat_mapping_original_tuple *original_tuple_item; ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ struct nat_mapping6 *mapping6; ++ union nf_inet_addr *ip6, *ext_ip6; ++ spin_lock_bh(&fullconenat6_lock); ++#endif ++ ++ spin_lock_bh(&fullconenat_lock); ++ spin_lock_bh(&dying_tuple_list_lock); ++ ++ list_for_each_safe(iter, tmp, &dying_tuple_list) { ++ item = list_entry(iter, struct tuple_list, list); ++ ++ /* we dont know the conntrack direction for now so we try in both ways. 
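++ * first assume tuple_original is the outbound tuple; if no mapping matches, retry using tuple_reply.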
*/ ++ ct_tuple = &(item->tuple_original); ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ if (ct_tuple->src.l3num == PF_INET6) { ++ ip6 = &(ct_tuple->src).u3; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ ext_ip6 = &item->tuple_reply.dst.u3; ++ mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); ++ if (mapping6 == NULL) { ++ ext_ip6 = &(ct_tuple->dst).u3; ++ ct_tuple = &(item->tuple_reply); ++ ip6 = &(ct_tuple->src).u3; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); ++ if (mapping6 != NULL) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", ++ mapping6->port); ++ } ++ } else { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", ++ mapping6->port); ++ } ++ ++ if (mapping6 == NULL) { ++ goto next; ++ } ++ ++ /* look for the corresponding out-dated tuple and free it */ ++ list_for_each_safe(iter_2, tmp_2, &mapping6->original_tuple_list) { ++ original_tuple_item = list_entry(iter_2, struct ++ nat_mapping_original_tuple, node); ++ ++ if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. free this tuple.\n", ++ fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping6->refer_count)--; ++ } ++ } ++ ++ /* then kill the mapping if needed */ ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", ++ mapping6->port, mapping6->refer_count); ++ if (mapping6->refer_count <= 0) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", ++ mapping6->port); ++ kill_mapping6(mapping6); ++ } ++ goto next; ++ } ++ if (unlikely(ct_tuple->src.l3num != PF_INET)) ++#else ++ if (ct_tuple->src.l3num != PF_INET) ++#endif ++ goto next; ++ ++ ip = (ct_tuple->src).u3.ip; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ ext_ip = item->tuple_reply.dst.u3.ip; ++ mapping = get_mapping_by_int_src(ip, port, ext_ip); ++ if (mapping == NULL) { ++ ext_ip = (ct_tuple->dst).u3.ip; ++ ct_tuple = &(item->tuple_reply); ++ ip = (ct_tuple->src).u3.ip; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ mapping = get_mapping_by_int_src(ip, port, ext_ip); ++ if (mapping != NULL) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", ++ mapping->port); ++ } ++ } else { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", ++ mapping->port); ++ } ++ ++ if (mapping == NULL) { ++ goto next; ++ } ++ ++ /* look for the corresponding out-dated tuple and free it */ ++ list_for_each_safe(iter_2, tmp_2, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter_2, struct nat_mapping_original_tuple, node); ++ ++ if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. 
free this tuple.\n", ++ nf_ct_stringify_tuple(&original_tuple_item->tuple)); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } ++ } ++ ++ /* then kill the mapping if needed */ ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", ++ mapping->port); ++ kill_mapping(mapping); ++ } ++ ++next: ++ list_del(&item->list); ++ kfree(item); ++ } ++ ++ spin_unlock_bh(&dying_tuple_list_lock); ++ spin_unlock_bh(&fullconenat_lock); ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ spin_unlock_bh(&fullconenat6_lock); ++#endif ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_handle_dying_tuples); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr ++ *int_addr, ++ const uint16_t int_port, ++ const uint16_t port, const union nf_inet_addr *addr) ++{ ++ struct nat_mapping6 *p_new; ++ u32 hash_src; ++ ++ p_new = kmalloc(sizeof(struct nat_mapping6), GFP_ATOMIC); ++ if (p_new == NULL) { ++ pr_err("kmalloc() for allocate_mapping6 failed.\n"); ++ return NULL; ++ } ++ p_new->addr = *addr; ++ p_new->port = port; ++ p_new->int_addr = *int_addr; ++ p_new->int_port = int_port; ++ p_new->refer_count = 0; ++ (p_new->original_tuple_list).next = &(p_new->original_tuple_list); ++ (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); ++ ++ hash_src = FULLCONE_HKEY(int_addr, sizeof(union nf_inet_addr), (u32) int_port, HASHTABLE_BUCKET_BITS); ++ //hash_src = jhash2((u32 *) int_addr->all, 4, (u32) int_port); ++ ++ hash_add(mapping6_table_by_ext_port, &p_new->node_by_ext_port, port); ++ hash_add(mapping6_table_by_int_src, &p_new->node_by_int_src, hash_src); ++ ++ pr_debug("new mapping allocated for [%pI6c]:%d ==> [%pI6c]:%d\n", ++ &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); ++ ++ return p_new; ++} ++#endif ++static struct nat_mapping *allocate_mapping(const __be32 int_addr, ++ const uint16_t int_port, const uint16_t port, const __be32 addr) ++{ ++ struct nat_mapping *p_new; ++ u32 hash_src; ++ ++ p_new = kmalloc(sizeof(struct nat_mapping), GFP_ATOMIC); ++ if (p_new == NULL) { ++ pr_err("kmalloc() for allocate_mapping failed.\n"); ++ return NULL; ++ } ++ p_new->addr = addr; ++ p_new->port = port; ++ p_new->int_addr = int_addr; ++ p_new->int_port = int_port; ++ p_new->refer_count = 0; ++ (p_new->original_tuple_list).next = &(p_new->original_tuple_list); ++ (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); ++ ++ hash_src = FULLCONE_HKEY(&int_addr, sizeof(__be32), (u32) int_port, HASHTABLE_BUCKET_BITS); ++ //hash_src = HASH_2(int_addr, (u32) int_port); ++ ++ hash_add(mapping_table_by_ext_port, &p_new->node_by_ext_port, port); ++ hash_add(mapping_table_by_int_src, &p_new->node_by_int_src, hash_src); ++ ++ pr_debug("new mapping allocated for %pI4:%d ==> %pI4:%d\n", ++ &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); ++ ++ return p_new; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple ++ *original_tuple) ++{ ++ struct 
nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); ++ if (item == NULL) { ++ pr_err("kmalloc() for add_original_tuple_to_mapping6 failed.\n"); ++ return; ++ } ++ memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); ++ list_add(&item->node, &mapping->original_tuple_list); ++ (mapping->refer_count)++; ++} ++#endif ++static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple ++ *original_tuple) ++{ ++ struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); ++ if (item == NULL) { ++ pr_err("kmalloc() for add_original_tuple_to_mapping failed.\n"); ++ return; ++ } ++ memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); ++ list_add(&item->node, &mapping->original_tuple_list); ++ (mapping->refer_count)++; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr ++ *src_ip, const uint16_t src_port, const union nf_inet_addr ++ *ext_ip) ++{ ++ struct nat_mapping6 *p_current; ++ u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); ++ ++ hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) ++ && p_current->int_port == src_port && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++ ++static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip) ++{ ++ struct nat_mapping *p_current; ++ u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = HASH_2(src_ip, (u32) src_port); ++ ++ hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (p_current->int_addr == src_ip && p_current->int_port == src_port && p_current->addr == ext_ip) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union ++ nf_inet_addr ++ *src_ip, const uint16_t src_port, const union ++ nf_inet_addr ++ *min_ip, const union ++ nf_inet_addr ++ *max_ip) ++{ ++ struct nat_mapping6 *p_current; ++ ++ u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); ++ ++ hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) ++ && p_current->int_port == src_port ++ && memcmp(&p_current->addr, min_ip, ++ sizeof(union nf_inet_addr)) >= 0 ++ && memcmp(&p_current->addr, max_ip, sizeof(union nf_inet_addr)) <= 0) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, ++ const uint16_t ++ src_port, const __be32 min_ip, const __be32 max_ip) ++{ ++ struct nat_mapping *p_current; ++ u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = HASH_2(src_ip, (u32) src_port); ++ ++ 
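++ /* scan this (src_ip, src_port) hash bucket and return the first mapping whose external address falls within [min_ip, max_ip] */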
hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (p_current->int_addr == src_ip ++ && p_current->int_port == src_port ++ && memcmp(&p_current->addr, &min_ip, sizeof(__be32)) >= 0 ++ && memcmp(&p_current->addr, &max_ip, sizeof(__be32)) <= 0) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void kill_mapping6(struct nat_mapping6 *mapping) ++{ ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ ++ if (mapping == NULL) { ++ return; ++ } ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ } ++ ++ hash_del(&mapping->node_by_ext_port); ++ hash_del(&mapping->node_by_int_src); ++ kfree(mapping); ++} ++#endif ++static void kill_mapping(struct nat_mapping *mapping) ++{ ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ ++ if (mapping == NULL) { ++ return; ++ } ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ } ++ ++ hash_del(&mapping->node_by_ext_port); ++ hash_del(&mapping->node_by_int_src); ++ kfree(mapping); ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping6() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone) ++{ ++#else ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone) ++{ ++#endif ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ struct nf_conntrack_tuple_hash *tuple_hash; ++ struct nf_conn *ct; ++ ++ /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. ++ * so we manually kill one of those tuples once we acquire one. */ ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ ++ tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); ++ ++ if (tuple_hash == NULL) { ++ pr_debug ++ ("check_mapping6(): tuple %s dying/unconfirmed. free this tuple.\n", ++ fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); ++ ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } else { ++ ct = nf_ct_tuplehash_to_ctrack(tuple_hash); ++ if (likely(ct != NULL)) ++ nf_ct_put(ct); ++ } ++ ++ } ++ ++ /* kill the mapping if need */ ++ pr_debug ++ ("check_mapping6() refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug("check_mapping6(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); ++ kill_mapping6(mapping); ++ return 0; ++ } else { ++ return 1; ++ } ++} ++ ++#endif ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. 
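++ * each recorded original tuple is looked up in conntrack; tuples that are gone are freed and refer_count is decremented.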
++ * the mapping should not be used anymore after check_mapping() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone) ++{ ++#else ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone) ++{ ++#endif ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ struct nf_conntrack_tuple_hash *tuple_hash; ++ struct nf_conn *ct; ++ ++ /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. ++ * so we manually kill one of those tuples once we acquire one. */ ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ ++ tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); ++ ++ if (tuple_hash == NULL) { ++ pr_debug ++ ("check_mapping(): tuple %s dying/unconfirmed. free this tuple.\n", ++ nf_ct_stringify_tuple(&original_tuple_item->tuple)); ++ ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } else { ++ ct = nf_ct_tuplehash_to_ctrack(tuple_hash); ++ if (likely(ct != NULL)) ++ nf_ct_put(ct); ++ } ++ ++ } ++ ++ /* kill the mapping if need */ ++ pr_debug ++ ("check_mapping() refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug("check_mapping(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); ++ kill_mapping(mapping); ++ return 0; ++ } else { ++ return 1; ++ } ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone) ++{ ++#else ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const u16 zone) ++{ ++#endif ++ struct nat_mapping6 *p_current; ++ struct hlist_node *tmp; ++ ++ hash_for_each_possible_safe(mapping6_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { ++ if (p_current->port == port && check_mapping6(p_current, net, zone) ++ && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone) ++{ ++#else ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, ++ const __be32 ext_ip, struct net *net, const u16 zone) ++{ ++#endif ++ struct nat_mapping *p_current; ++ struct hlist_node *tmp; ++ ++ hash_for_each_possible_safe(mapping_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { ++ if (p_current->port == port && check_mapping(p_current, net, zone) ++ && p_current->addr == ext_ip) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port6(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const union nf_inet_addr 
*ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#else ++static uint16_t find_appropriate_port6(struct net *net, const u16 zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#endif ++{ ++ uint16_t min, start, selected, range_size, i; ++ struct nat_mapping6 *mapping = NULL; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&range->base_proto, 0, sizeof(range->base_proto)); ++#endif ++ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { ++ min = be16_to_cpu((range->min_proto).udp.port); ++ range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; ++ } else { ++ /* minimum port is 1024. same behavior as default linux NAT. */ ++ min = 1024; ++ range_size = 65535 - min + 1; ++ } ++ ++ if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) ++ || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { ++ /* for now we do the same thing for both --random and --random-fully */ ++ ++ /* select a random starting point */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) ++ start = (uint16_t) (get_random_u32() % (u32) range_size); ++#else ++ start = (uint16_t) (prandom_u32() % (u32) range_size); ++#endif ++ } else { ++ ++ if ((original_port >= min && original_port <= min + range_size - 1) ++ || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { ++ /* 1. try to preserve the port if it's available */ ++ mapping = get_mapping6_by_ext_port(original_port, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return original_port; ++ } ++ } ++ ++ /* otherwise, we start from zero */ ++ start = 0; ++ } ++ ++ for (i = 0; i < range_size; i++) { ++ /* 2. try to find an available port */ ++ selected = min + ((start + i) % range_size); ++ mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return selected; ++ } ++ } ++ ++ /* 3. at least we tried. override a previous mapping. */ ++ selected = min + start; ++ mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); ++ kill_mapping6(mapping); ++ ++ return selected; ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#else ++static uint16_t find_appropriate_port(struct net *net, const u16 zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#endif ++{ ++ uint16_t min, start, selected, range_size, i; ++ struct nat_mapping *mapping = NULL; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&range->base_proto, 0, sizeof(range->base_proto)); ++#endif ++ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { ++ min = be16_to_cpu((range->min_proto).udp.port); ++ range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; ++ } else { ++ /* minimum port is 1024. same behavior as default linux NAT. 
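++ * an explicit proto range (NF_NAT_RANGE_PROTO_SPECIFIED) is used as-is and may therefore select ports below 1024.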
*/ ++ min = 1024; ++ range_size = 65535 - min + 1; ++ } ++ ++ if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) ++ || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { ++ /* for now we do the same thing for both --random and --random-fully */ ++ ++ /* select a random starting point */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) ++ start = (uint16_t) (get_random_u32() % (u32) range_size); ++#else ++ start = (uint16_t) (prandom_u32() % (u32) range_size); ++#endif ++ } else { ++ ++ if ((original_port >= min && original_port <= min + range_size - 1) ++ || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { ++ /* 1. try to preserve the port if it's available */ ++ mapping = get_mapping_by_ext_port(original_port, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return original_port; ++ } ++ } ++ ++ /* otherwise, we start from zero */ ++ start = 0; ++ } ++ ++ for (i = 0; i < range_size; i++) { ++ /* 2. try to find an available port */ ++ selected = min + ((start + i) % range_size); ++ mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return selected; ++ } ++ } ++ ++ /* 3. at least we tried. override a previous mapping. */ ++ selected = min + start; ++ mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); ++ kill_mapping(mapping); ++ ++ return selected; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static void find_leastused_ip6(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) ++#else ++static void find_leastused_ip6(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) ++#endif ++{ ++ unsigned int i; ++ /* Host order */ ++ u32 minip, maxip, j, dist; ++ bool full_range; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&(range->base_proto), 0, sizeof(range->base_proto)); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), ++ range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id, HASHTABLE_BUCKET_BITS); ++ //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id); ++#else ++ j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), ++ range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone, HASHTABLE_BUCKET_BITS); ++ //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone); ++#endif ++ ++ full_range = false; ++ for (i = 0; i <= 3; i++) { ++ /* If first bytes of the address are at the maximum, use the ++ * distance. Otherwise use the full range. 
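++ * each 32-bit word of the address is picked with reciprocal_scale() over the span allowed for that word.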
*/ ++ if (!full_range) { ++ minip = ntohl(range->min_addr.all[i]); ++ maxip = ntohl(range->max_addr.all[i]); ++ dist = maxip - minip + 1; ++ } else { ++ minip = 0; ++ dist = ~0; ++ } ++ ++ var_ipp->all[i] = (__force __be32) htonl(minip + reciprocal_scale(j, dist)); ++ if (var_ipp->all[i] != range->max_addr.all[i]) ++ full_range = true; ++ ++ if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) ++ j ^= (__force u32) dst->all[i]; ++ } ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst) ++#else ++static __be32 find_leastused_ip(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst) ++#endif ++{ ++ /* Host order */ ++ u32 minip, maxip, j, dist; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&(range->base_proto), 0, sizeof(range->base_proto)); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id, ++ HASHTABLE_BUCKET_BITS); ++ //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id); ++#else ++ j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone, ++ HASHTABLE_BUCKET_BITS); ++ //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone); ++#endif ++ ++ minip = ntohl(range->min_addr.ip); ++ maxip = ntohl(range->max_addr.ip); ++ dist = maxip - minip + 1; ++ ++ return (__be32) htonl(minip + reciprocal_scale(j, dist)); ++} ++ ++void nf_nat_fullcone_destroy_mappings(void) ++{ ++ struct nat_mapping *p_current; ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ struct nat_mapping6 *p6_current; ++#endif ++ struct hlist_node *tmp; ++ int i; ++ ++ spin_lock_bh(&fullconenat_lock); ++ ++ hash_for_each_safe(mapping_table_by_ext_port, i, tmp, p_current, node_by_ext_port) { ++ kill_mapping(p_current); ++ } ++ ++ spin_unlock_bh(&fullconenat_lock); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ spin_lock_bh(&fullconenat6_lock); ++ ++ hash_for_each_safe(mapping6_table_by_ext_port, i, tmp, p6_current, node_by_ext_port) { ++ kill_mapping6(p6_current); ++ } ++ ++ spin_unlock_bh(&fullconenat6_lock); ++#endif ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_destroy_mappings); ++ ++/* ++ * nfproto choices ++ * enum { ++ NFPROTO_INET = 1, ++ NFPROTO_IPV4 = 2, ++ NFPROTO_IPV6 = 10, ++}; ++ */ ++static unsigned int nf_nat_handle_prerouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *newrange) ++#else ++ struct nf_nat_range *newrange) ++#endif ++{ ++ unsigned int ret; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ const struct nf_conntrack_zone *zone; ++#else ++ u16 zone; ++#endif ++ struct net *net; ++ struct nf_conn *ct; ++ enum ip_conntrack_info ctinfo; ++ struct nf_conntrack_tuple *ct_tuple_origin; ++ ++ uint16_t port, original_port; ++ uint8_t protonum; ++ ++/* NFPROTO specific def */ ++ struct nat_mapping *mapping; ++ struct nat_mapping6 *mapping_6; ++ 
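++ /* ip / ip_6 hold the inbound packet's external destination address, depending on nfproto */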
++ __be32 ip; ++ union nf_inet_addr *ip_6; ++ /* NFPROTO specific def end */ ++ ++ WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto == NFPROTO_IPV6)); ++ ++ /* NFPROTO specific init */ ++ mapping = NULL; ++ mapping_6 = NULL; ++ ++ ip = 0; ++ ip_6 = NULL; ++ /* NFPROTO specific init end */ ++ ++ original_port = 0; ++ ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ net = nf_ct_net(ct); ++ zone = nf_ct_zone(ct); ++ ++ ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ ++ protonum = (ct_tuple_origin->dst).protonum; ++ if (protonum != IPPROTO_UDP) { ++ // Currently only UDP traffic is supported for full-cone NAT. ++ // For other protos FULLCONENAT is equivalent to MASQUERADE. ++ return ret; ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ ip = (ct_tuple_origin->dst).u3.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ ip_6 = &(ct_tuple_origin->dst).u3; ++ } ++ ++ port = be16_to_cpu((ct_tuple_origin->dst).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ spin_lock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_lock_bh(&fullconenat6_lock); ++ } ++ ++ /* find an active mapping based on the inbound port */ ++ if (nfproto == NFPROTO_IPV4) { ++ mapping = get_mapping_by_ext_port(port, ip, net, zone); ++ } else if (nfproto == NFPROTO_IPV6) { ++ mapping_6 = get_mapping6_by_ext_port(port, ip_6, net, zone); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (mapping == NULL) { ++ goto unlock; ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (mapping_6 == NULL) { ++ goto unlock; ++ } ++ } ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_addr.ip = mapping->int_addr; ++ newrange->max_addr.ip = mapping->int_addr; ++ newrange->min_proto.udp.port = cpu_to_be16(mapping->int_port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_addr = mapping_6->int_addr; ++ newrange->max_addr = mapping_6->int_addr; ++ newrange->min_proto.udp.port = cpu_to_be16(mapping_6->int_port); ++ } ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ pr_debug(" %s ==> %pI4:%d\n", ++ nf_ct_stringify_tuple(ct_tuple_origin), &mapping->int_addr, mapping->int_port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ pr_debug(" %s ==> [%pI6c]:%d\n", ++ fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), &mapping_6->int_addr, mapping_6->int_port); ++ } ++ ++ ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); ++ ++ if (ret == NF_ACCEPT) { ++ if (nfproto == NFPROTO_IPV4) { ++ add_original_tuple_to_mapping(mapping, ct_tuple_origin); ++ pr_debug ++ ("INBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ } else if (nfproto == NFPROTO_IPV6) { ++ add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); ++ pr_debug ++ ("INBOUND: refer_count for mapping_6 at ext_port %d is now %d\n", ++ mapping_6->port, mapping_6->refer_count); ++ } ++ ++ } ++ ++unlock: ++ if (nfproto == NFPROTO_IPV4) { ++ spin_unlock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_unlock_bh(&fullconenat6_lock); ++ } ++ ++ return ret; ++ ++} ++ ++static unsigned int nf_nat_handle_postrouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, struct nf_nat_range2 *newrange, ++#else ++ struct nf_nat_range *range, struct nf_nat_range *newrange, ++#endif ++ const struct net_device *out) ++{ ++ unsigned int ret; ++#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ const struct nf_conntrack_zone *zone; ++#else ++ u16 zone; ++#endif ++ struct net *net; ++ struct nf_conn *ct; ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn_nat *nat; ++ struct nf_conntrack_tuple *ct_tuple, *ct_tuple_origin; ++ uint16_t port, original_port, want_port; ++ uint8_t protonum; ++ bool is_src_mapping_active; ++ ++ /* NFPROTO specific def */ ++ struct nat_mapping *mapping, *src_mapping; ++ struct nat_mapping6 *mapping_6, *src_mapping_6; ++ ++ __be32 ip; ++ union nf_inet_addr *ip_6; ++ ++ const struct rtable *rt; ++ __be32 newsrc, nh; ++ ++ /* NFPROTO specific def end */ ++ ++ WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto == NFPROTO_IPV6)); ++ ++ /* NFPROTO specific init */ ++ mapping = NULL; ++ src_mapping = NULL; ++ mapping_6 = NULL; ++ src_mapping_6 = NULL; ++ ++ ip = 0; ++ ip_6 = NULL; ++ /* NFPROTO specific init end */ ++ ++ original_port = 0; ++ ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ net = nf_ct_net(ct); ++ zone = nf_ct_zone(ct); ++ ++ ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ protonum = (ct_tuple_origin->dst).protonum; ++ ++ if (range->flags & NF_NAT_RANGE_MAP_IPS) { ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_addr.ip = range->min_addr.ip; ++ newrange->max_addr.ip = range->max_addr.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_addr = range->min_addr; ++ newrange->max_addr = range->max_addr; ++ } ++ ++ } else { ++ if (nfproto == NFPROTO_IPV4) { ++ rt = skb_rtable(skb); ++ nh = rt_nexthop(rt, ip_hdr(skb)->daddr); ++ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); ++ ++ if (unlikely(!newsrc)) ++ return NF_DROP; ++ newrange->min_addr.ip = newsrc; ++ newrange->max_addr.ip = newsrc; ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (unlikely ++ (nat_ipv6_dev_get_saddr ++ (nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &(newrange->min_addr.in6)) < 0)) ++ return NF_DROP; ++ newrange->max_addr = newrange->min_addr; ++ ++ } ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) ++ nat = nf_ct_nat_ext_add(ct); ++#else ++ nat = nfct_nat(ct); ++#endif ++ if (likely(nat)) ++ nat->masq_index = out->ifindex; ++ ++ } ++ ++ if (protonum == IPPROTO_UDP) { ++ if (nfproto == NFPROTO_IPV4) { ++ ip = (ct_tuple_origin->src).u3.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ ip_6 = &(ct_tuple_origin->src).u3; ++ } ++ ++ original_port = be16_to_cpu((ct_tuple_origin->src).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ spin_lock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_lock_bh(&fullconenat6_lock); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) ++ src_mapping = ++ get_mapping_by_int_src_inrange(ip, ++ original_port, ++ newrange->min_addr.ip, newrange->max_addr.ip); ++ else ++ src_mapping = get_mapping_by_int_src(ip, original_port, newrange->min_addr.ip); ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) ++ src_mapping_6 = ++ get_mapping6_by_int_src_inrange(ip_6, ++ original_port, ++ &newrange->min_addr, &newrange->max_addr); ++ else ++ src_mapping_6 = get_mapping6_by_int_src(ip_6, original_port, &newrange->min_addr); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ is_src_mapping_active = src_mapping != NULL && check_mapping(src_mapping, net, zone); ++ } else if (nfproto == NFPROTO_IPV6) { ++ is_src_mapping_active = src_mapping_6 != NULL && check_mapping6(src_mapping_6, net, zone); ++ } ++ ++ if 
(is_src_mapping_active) { ++ ++ /* outbound nat: if a previously established mapping is active, ++ * we will reuse that mapping. */ ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_proto.udp.port = cpu_to_be16(src_mapping->port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_proto.udp.port = cpu_to_be16(src_mapping_6->port); ++ } ++ ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) { ++ newrange->min_addr.ip = src_mapping->addr; ++ newrange->max_addr.ip = newrange->min_addr.ip; ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { ++ newrange->min_addr = src_mapping_6->addr; ++ newrange->max_addr = newrange->min_addr; ++ } ++ } ++ ++ } else { ++ ++ /* if not, we find a new external IP:port to map to. ++ * the SNAT may fail so we should re-check the mapped port later. */ ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) { ++ newrange->min_addr.ip = ++ find_leastused_ip(zone, range, ip, (ct_tuple_origin->dst).u3.ip); ++ newrange->max_addr.ip = newrange->min_addr.ip; ++ } ++ want_port = ++ find_appropriate_port(net, zone, original_port, newrange->min_addr.ip, range); ++ } else if (nfproto == NFPROTO_IPV6) { ++ ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { ++ find_leastused_ip6(zone, range, ip_6, ++ &(ct_tuple_origin->dst).u3, &newrange->min_addr); ++ newrange->max_addr = newrange->min_addr; ++ } ++ ++ want_port = ++ find_appropriate_port6(net, zone, original_port, &newrange->min_addr, range); ++ } ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ newrange->min_proto.udp.port = cpu_to_be16(want_port); ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ src_mapping = NULL; ++ } else if (nfproto == NFPROTO_IPV6) { ++ src_mapping_6 = NULL; ++ } ++ ++ } ++ } ++ ++ /* do SNAT now */ ++ ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); ++ ++ if (protonum != IPPROTO_UDP) { ++ /* non-UDP packets, bailout */ ++ goto out; ++ } ++ if (ret != NF_ACCEPT) { ++ /* failed SNAT, bailout */ ++ goto unlock; ++ } ++ ++ /* the reply tuple contains the mapped port. */ ++ ct_tuple = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); ++ /* this is the resulted mapped port. 
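++ * read back from the reply tuple: nf_nat_setup_info() may have picked a different port than want_port on collision.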
*/ ++ port = be16_to_cpu((ct_tuple->dst).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ pr_debug(" %s ==> %d\n", nf_ct_stringify_tuple(ct_tuple_origin), port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ pr_debug(" %s ==> %d\n", fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), port); ++ } ++ ++ /* save the mapping information into our mapping table */ ++ ++ if (nfproto == NFPROTO_IPV4) { ++ mapping = src_mapping; ++ if (mapping == NULL) { ++ mapping = allocate_mapping(ip, original_port, port, (ct_tuple->dst).u3.ip); ++ } ++ if (likely(mapping != NULL)) { ++ add_original_tuple_to_mapping(mapping, ct_tuple_origin); ++ pr_debug ++ (" OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ mapping_6 = src_mapping_6; ++ if (mapping_6 == NULL) { ++ mapping_6 = allocate_mapping6(ip_6, original_port, port, &(ct_tuple->dst).u3); ++ } ++ if (likely(mapping_6 != NULL)) { ++ add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); ++ pr_debug ++ ("OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping_6->port, mapping_6->refer_count); ++ } ++ } ++ ++unlock: ++ if (nfproto == NFPROTO_IPV4) { ++ spin_unlock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_unlock_bh(&fullconenat6_lock); ++ } ++ ++out: ++ return ret; ++} ++ ++unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 newrange; ++#else ++ struct nf_nat_range newrange; ++#endif ++ ++ WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); ++ ++ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); ++ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); ++ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; ++ newrange.min_proto = range->min_proto; ++ newrange.max_proto = range->max_proto; ++ ++ switch (hooknum) { ++ case NF_INET_PRE_ROUTING: ++ /* inbound packets */ ++ return nf_nat_handle_prerouting(NFPROTO_IPV4, skb, hooknum, &newrange); ++ case NF_INET_POST_ROUTING: ++ /* outbound packets */ ++ return nf_nat_handle_postrouting(NFPROTO_IPV4, skb, hooknum, range, &newrange, out); ++ } ++ ++ WARN_ON(1); ++ // logical error ++ return 5; ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv4); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ++nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, ++ const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) ++{ ++#ifdef CONFIG_IPV6_MODULE ++ const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); ++ ++ if (!v6_ops) ++ return -EHOSTUNREACH; ++ ++ return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); ++#else ++ return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); ++#endif ++} ++#endif ++ ++unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 newrange; ++#else ++ struct nf_nat_range newrange; ++#endif ++ ++ WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); ++ ++ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); ++ 
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); ++ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; ++ newrange.min_proto = range->min_proto; ++ newrange.max_proto = range->max_proto; ++ ++ switch (hooknum) { ++ case NF_INET_PRE_ROUTING: ++ /* inbound packets */ ++ return nf_nat_handle_prerouting(NFPROTO_IPV6, skb, hooknum, &newrange); ++ case NF_INET_POST_ROUTING: ++ /* outbound packets */ ++ return nf_nat_handle_postrouting(NFPROTO_IPV6, skb, hooknum, range, &newrange, out); ++ } ++ ++ WARN_ON(1); ++ // logical error ++ return 5; ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv6); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Syrone Wong "); ++MODULE_DESCRIPTION("Netfilter fullcone expression support library of RFC3489 full cone NAT"); +--- /dev/null ++++ b/net/netfilter/nft_ext_fullcone.c +@@ -0,0 +1,466 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ * Copyright (c) 2022 Alexandre Frade ++ * Ported to latest kernel source tree ++ */ ++ ++#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt ++#define NF_FULLCONE_WORKQUEUE_NAME "fullcone " KBUILD_MODNAME ": wq" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++struct notifier_block ct_event_notifier; ++#else ++struct nf_ct_event_notifier ct_event_notifier; ++#endif ++static DEFINE_MUTEX(nf_ct_net_event_lock); ++int ct_event_notifier_registered = 0; ++ ++int module_refer_count = 0; ++ ++static void gc_worker(struct work_struct *work); ++static struct workqueue_struct *wq __read_mostly = NULL; ++static DECLARE_DELAYED_WORK(gc_worker_wk, gc_worker); ++ ++static void gc_worker(struct work_struct *work) ++{ ++ nf_nat_fullcone_handle_dying_tuples(); ++} ++ ++struct nft_fullcone { ++ u32 flags; ++ u8 sreg_proto_min; ++ u8 sreg_proto_max; ++}; ++ ++static const struct nla_policy nft_fullcone_policy[NFTA_FULLCONE_MAX + 1] = { ++ [NFTA_FULLCONE_FLAGS] = {.type = NLA_U32 }, ++ [NFTA_FULLCONE_REG_PROTO_MIN] = {.type = NLA_U32 }, ++ [NFTA_FULLCONE_REG_PROTO_MAX] = {.type = NLA_U32 }, ++}; ++ ++/* conntrack destroy event callback function */ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static int ct_event_cb(struct notifier_block *this, unsigned long events, void *ptr) ++{ ++ struct nf_ct_event *item = ptr; ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++static int ct_event_cb(unsigned int events, const struct nf_ct_event *item) ++{ ++#else ++static int ct_event_cb(unsigned int events, struct nf_ct_event *item) ++{ ++#endif ++ struct nf_conn *ct; ++ struct nf_conntrack_tuple *ct_tuple_reply, *ct_tuple_original; ++ uint8_t protonum; ++ struct tuple_list *dying_tuple_item; ++ ++ ct = item->ct; ++ /* we handle only conntrack destroy events */ ++ if (ct == NULL || !(events & (1 << IPCT_DESTROY))) { ++ return 
0; ++ } ++ ++ ct_tuple_original = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ ++ ct_tuple_reply = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); ++ ++ protonum = (ct_tuple_original->dst).protonum; ++ if (protonum != IPPROTO_UDP) { ++ return 0; ++ } ++ ++ dying_tuple_item = kmalloc(sizeof(struct tuple_list), GFP_ATOMIC); ++ ++ if (dying_tuple_item == NULL) { ++ pr_debug("warning: ct_event_cb(): kmalloc failed.\n"); ++ return 0; ++ } ++ ++ memcpy(&(dying_tuple_item->tuple_original), ct_tuple_original, sizeof(struct nf_conntrack_tuple)); ++ memcpy(&(dying_tuple_item->tuple_reply), ct_tuple_reply, sizeof(struct nf_conntrack_tuple)); ++ ++ nf_nat_fullcone_dying_tuple_list_add(&(dying_tuple_item->list)); ++ ++ if (wq != NULL) ++ queue_delayed_work(wq, &gc_worker_wk, msecs_to_jiffies(100)); ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++static int exp_event_cb(unsigned int events, const struct nf_exp_event *item) ++{ ++ return 0; ++} ++#endif ++ ++static int nft_fullcone_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) ++{ ++ int err; ++ ++ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); ++ if (err < 0) ++ return err; ++ ++ // TODO: check hooks ++ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING)); ++} ++ ++static int nft_fullcone_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr *const tb[]) ++{ ++ int err; ++ int register_ct_notifier_ret = 0; ++ ++ err = nf_ct_netns_get(ctx->net, ctx->family); ++ ++ mutex_lock(&nf_ct_net_event_lock); ++ ++ module_refer_count++; ++ ++ pr_debug("nft_fullcone_init(): module_refer_count is now %d\n", module_refer_count); ++ ++ if (module_refer_count == 1) { ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ ct_event_notifier.notifier_call = ct_event_cb; ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ ct_event_notifier.ct_event = ct_event_cb; ++ ct_event_notifier.exp_event = exp_event_cb; ++#else ++ ct_event_notifier.fcn = ct_event_cb; ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++ if (!READ_ONCE(ctx->net->ct.nf_conntrack_event_cb)) { ++ nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); ++ } ++#else ++ register_ct_notifier_ret = nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); ++#endif ++ ++ if (register_ct_notifier_ret) { ++ /* non-zero means failure */ ++ pr_warn("failed to register a conntrack notifier. 
Disable active GC for mappings.\n"); ++ } else { ++ ct_event_notifier_registered = 1; ++ pr_debug("nft_fullcone_init(): ct_event_notifier registered\n"); ++ } ++ ++ } ++ ++ mutex_unlock(&nf_ct_net_event_lock); ++ ++ return err; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 2, 0) ++static int nft_fullcone_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) ++#else ++static int nft_fullcone_dump(struct sk_buff *skb, const struct nft_expr *expr) ++#endif ++{ ++ const struct nft_fullcone *priv = nft_expr_priv(expr); ++ ++ if (priv->flags != 0 && nla_put_be32(skb, NFTA_FULLCONE_FLAGS, htonl(priv->flags))) ++ goto nla_put_failure; ++ ++ if (priv->sreg_proto_min) { ++ if (nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MIN, ++ priv->sreg_proto_min) || ++ nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MAX, priv->sreg_proto_max)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -1; ++} ++ ++/* nft_fullcone_set_regs sets nft_regs from nft_expr fullcone specific private data */ ++static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range ++#else ++ struct nf_nat_range *range ++#endif ++ ) ++{ ++ // private data connected via nft_expr_type.ops <==> nft_expr_ops.type ++ // private data type from nft_expr_type.{policy,maxattr,ops} ++ // private data size from nft_expr_ops.size ++ struct nft_fullcone *priv = nft_expr_priv(expr); ++ range->flags = priv->flags; ++ if (priv->sreg_proto_min) { ++ range->min_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_min]); ++ range->max_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_max]); ++ } ++} ++ ++static void nft_fullcone_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 range; ++#else ++ struct nf_nat_range range; ++#endif ++ ++ memset(&range, 0, sizeof(range)); ++ nft_fullcone_set_regs(expr, regs, &range); ++ regs->verdict.code = nf_nat_fullcone_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); ++} ++ ++static void nft_fullcone_common_destory(const struct nft_ctx *ctx) ++{ ++ mutex_lock(&nf_ct_net_event_lock); ++ ++ module_refer_count--; ++ ++ pr_debug("nft_fullcone_common_destory(): module_refer_count is now %d\n", module_refer_count); ++ ++ if (module_refer_count == 0) { ++ if (ct_event_notifier_registered) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++ nf_conntrack_unregister_notifier(ctx->net); ++#else ++ nf_conntrack_unregister_notifier(ctx->net, &ct_event_notifier); ++#endif ++ ct_event_notifier_registered = 0; ++ ++ pr_debug("nft_fullcone_common_destory(): ct_event_notifier unregistered\n"); ++ ++ } ++ } ++ ++ mutex_unlock(&nf_ct_net_event_lock); ++} ++ ++static void nft_fullcone_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_IPV4); ++} ++ ++static struct nft_expr_type nft_fullcone_ipv4_type; ++static const struct nft_expr_ops nft_fullcone_ipv4_ops = { ++ .type = &nft_fullcone_ipv4_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_ipv4_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_ipv4_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type 
nft_fullcone_ipv4_type __read_mostly = { ++ .family = NFPROTO_IPV4, ++ .name = "fullcone", ++ .ops = &nft_fullcone_ipv4_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++#ifdef CONFIG_NF_TABLES_IPV6 ++static void nft_fullcone_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 range; ++#else ++ struct nf_nat_range range; ++#endif ++ ++ memset(&range, 0, sizeof(range)); ++ nft_fullcone_set_regs(expr, regs, &range); ++ regs->verdict.code = nf_nat_fullcone_ipv6(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); ++} ++ ++static void nft_fullcone_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_IPV6); ++} ++ ++static struct nft_expr_type nft_fullcone_ipv6_type; ++static const struct nft_expr_ops nft_fullcone_ipv6_ops = { ++ .type = &nft_fullcone_ipv6_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_ipv6_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_ipv6_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type nft_fullcone_ipv6_type __read_mostly = { ++ .family = NFPROTO_IPV6, ++ .name = "fullcone", ++ .ops = &nft_fullcone_ipv6_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init nft_fullcone_module_init_ipv6(void) ++{ ++ return nft_register_expr(&nft_fullcone_ipv6_type); ++} ++ ++static void nft_fullcone_module_exit_ipv6(void) ++{ ++ nft_unregister_expr(&nft_fullcone_ipv6_type); ++} ++#else ++static inline int nft_fullcone_module_init_ipv6(void) ++{ ++ return 0; ++} ++ ++static inline void nft_fullcone_module_exit_ipv6(void) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NF_TABLES_INET ++static void nft_fullcone_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++ switch (nft_pf(pkt)) { ++ case NFPROTO_IPV4: ++ return nft_fullcone_ipv4_eval(expr, regs, pkt); ++ case NFPROTO_IPV6: ++ return nft_fullcone_ipv6_eval(expr, regs, pkt); ++ } ++ ++ WARN_ON_ONCE(1); ++} ++ ++static void nft_fullcone_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_INET); ++} ++ ++static struct nft_expr_type nft_fullcone_inet_type; ++static const struct nft_expr_ops nft_fullcone_inet_ops = { ++ .type = &nft_fullcone_inet_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_inet_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_inet_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type nft_fullcone_inet_type __read_mostly = { ++ .family = NFPROTO_INET, ++ .name = "fullcone", ++ .ops = &nft_fullcone_inet_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init nft_fullcone_module_init_inet(void) ++{ ++ return nft_register_expr(&nft_fullcone_inet_type); ++} ++ ++static void nft_fullcone_module_exit_inet(void) ++{ ++ nft_unregister_expr(&nft_fullcone_inet_type); ++} ++#else ++static inline int nft_fullcone_module_init_inet(void) ++{ ++ return 0; ++} ++ ++static inline void nft_fullcone_module_exit_inet(void) ++{ ++} ++#endif ++ ++static int __init 
nft_fullcone_module_init(void) ++{ ++ int ret; ++ ++ ret = nft_fullcone_module_init_ipv6(); ++ if (ret < 0) ++ return ret; ++ ++ ret = nft_fullcone_module_init_inet(); ++ if (ret < 0) { ++ nft_fullcone_module_exit_ipv6(); ++ return ret; ++ } ++ ++ ret = nft_register_expr(&nft_fullcone_ipv4_type); ++ if (ret < 0) { ++ nft_fullcone_module_exit_inet(); ++ nft_fullcone_module_exit_ipv6(); ++ return ret; ++ } ++ ++ wq = create_singlethread_workqueue(NF_FULLCONE_WORKQUEUE_NAME); ++ if (wq == NULL) { ++ pr_err("failed to create workqueue %s\n", NF_FULLCONE_WORKQUEUE_NAME); ++ } ++ ++ return ret; ++} ++ ++static void __exit nft_fullcone_module_exit(void) ++{ ++ nft_fullcone_module_exit_ipv6(); ++ nft_fullcone_module_exit_inet(); ++ nft_unregister_expr(&nft_fullcone_ipv4_type); ++ ++ if (wq) { ++ cancel_delayed_work_sync(&gc_worker_wk); ++ flush_workqueue(wq); ++ destroy_workqueue(wq); ++ } ++ ++ nf_nat_fullcone_handle_dying_tuples(); ++ nf_nat_fullcone_destroy_mappings(); ++} ++ ++module_init(nft_fullcone_module_init); ++module_exit(nft_fullcone_module_exit); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Syrone Wong "); ++MODULE_ALIAS_NFT_EXPR("fullcone"); ++MODULE_DESCRIPTION("Netfilter nftables fullcone expression support of RFC3489 full cone NAT"); diff --git a/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch new file mode 100644 index 0000000..ca309c3 --- /dev/null +++ b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch @@ -0,0 +1,809 @@ +From 9a066610b055315bf155a99b2ea0a58245ef11e2 Mon Sep 17 00:00:00 2001 +From: Felix Fietkau +Date: Tue, 20 Feb 2018 15:56:02 +0100 +Subject: [PATCH 2/2] netfilter: add xt_FLOWOFFLOAD target + +Signed-off-by: Felix Fietkau +Signed-off-by: Alexandre Frade +--- + include/net/netfilter/nf_flow_table.h | 5 + + include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h | 17 + + net/netfilter/Kconfig | 9 + + net/netfilter/Makefile | 1 + + net/netfilter/nf_flow_table_core.c | 5 +- + net/netfilter/xt_FLOWOFFLOAD.c | 698 ++++++++++++++++++ + 6 files changed, 732 insertions(+), 3 deletions(-) + create mode 100644 include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h + create mode 100644 net/netfilter/xt_FLOWOFFLOAD.c + +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -294,6 +294,11 @@ void nf_flow_table_free(struct nf_flowta + + void flow_offload_teardown(struct flow_offload *flow); + ++int nf_flow_table_iterate(struct nf_flowtable *flow_table, ++ void (*iter)(struct nf_flowtable *flowtable, ++ struct flow_offload *flow, void *data), ++ void *data); ++ + void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +--- /dev/null ++++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++#ifndef _XT_FLOWOFFLOAD_H ++#define _XT_FLOWOFFLOAD_H ++ ++#include ++ ++enum { ++ XT_FLOWOFFLOAD_HW = 1 << 0, ++ ++ XT_FLOWOFFLOAD_MASK = XT_FLOWOFFLOAD_HW ++}; ++ ++struct xt_flowoffload_target_info { ++ __u32 flags; ++}; ++ ++#endif /* _XT_FLOWOFFLOAD_H */ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -1041,6 +1041,15 @@ config NETFILTER_XT_TARGET_NOTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_CT + ++config NETFILTER_XT_TARGET_FLOWOFFLOAD ++ tristate '"FLOWOFFLOAD" target support' ++ 
depends on NF_FLOW_TABLE ++ depends on NETFILTER_INGRESS ++ help ++ This option adds a `FLOWOFFLOAD' target, which uses the nf_flow_offload ++ module to speed up processing of packets by bypassing the usual ++ netfilter chains ++ + config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_ADVANCED +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -168,6 +168,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF + obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o + obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o + obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o ++obj-$(CONFIG_NETFILTER_XT_TARGET_FLOWOFFLOAD) += xt_FLOWOFFLOAD.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +--- a/net/netfilter/nf_flow_table_core.c ++++ b/net/netfilter/nf_flow_table_core.c +@@ -7,7 +7,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -373,8 +372,7 @@ flow_offload_lookup(struct nf_flowtable + } + EXPORT_SYMBOL_GPL(flow_offload_lookup); + +-static int +-nf_flow_table_iterate(struct nf_flowtable *flow_table, ++int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct nf_flowtable *flowtable, + struct flow_offload *flow, void *data), + void *data) +@@ -435,6 +433,7 @@ static void nf_flow_offload_gc_step(stru + nf_flow_offload_stats(flow_table, flow); + } + } ++EXPORT_SYMBOL_GPL(nf_flow_table_iterate); + + void nf_flow_table_gc_run(struct nf_flowtable *flow_table) + { +--- /dev/null ++++ b/net/netfilter/xt_FLOWOFFLOAD.c +@@ -0,0 +1,698 @@ ++/* ++ * Copyright (C) 2018-2021 Felix Fietkau ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
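A usage sketch for the FLOWOFFLOAD target described in the Kconfig help above, assuming an iptables build that ships the matching libxt_FLOWOFFLOAD userspace extension; the --hw flag mirrors XT_FLOWOFFLOAD_HW and is an assumption about that extension, not something this kernel patch provides on its own:

    # software flow offload for established connections
    iptables -A FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD
    # hardware offload, if both the extension and the NIC support it
    iptables -A FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD --hw

Only confirmed TCP connections in the established state and UDP flows are picked up; everything else falls through with XT_CONTINUE, as the target code below shows.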
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct xt_flowoffload_hook { ++ struct hlist_node list; ++ struct nf_hook_ops ops; ++ struct net *net; ++ bool registered; ++ bool used; ++}; ++ ++struct xt_flowoffload_table { ++ struct nf_flowtable ft; ++ struct hlist_head hooks; ++ struct delayed_work work; ++}; ++ ++struct nf_forward_info { ++ const struct net_device *indev; ++ const struct net_device *outdev; ++ const struct net_device *hw_outdev; ++ struct id { ++ __u16 id; ++ __be16 proto; ++ } encap[NF_FLOW_TABLE_ENCAP_MAX]; ++ u8 num_encaps; ++ u8 ingress_vlans; ++ u8 h_source[ETH_ALEN]; ++ u8 h_dest[ETH_ALEN]; ++ enum flow_offload_xmit_type xmit_type; ++}; ++ ++static DEFINE_SPINLOCK(hooks_lock); ++ ++struct xt_flowoffload_table flowtable[2]; ++ ++static unsigned int ++xt_flowoffload_net_hook(void *priv, struct sk_buff *skb, ++ const struct nf_hook_state *state) ++{ ++ struct vlan_ethhdr *veth; ++ __be16 proto; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_8021Q): ++ veth = (struct vlan_ethhdr *)skb_mac_header(skb); ++ proto = veth->h_vlan_encapsulated_proto; ++ break; ++ case htons(ETH_P_PPP_SES): ++ if (!nf_flow_pppoe_proto(skb, &proto)) ++ return NF_ACCEPT; ++ break; ++ default: ++ proto = skb->protocol; ++ break; ++ } ++ ++ switch (proto) { ++ case htons(ETH_P_IP): ++ return nf_flow_offload_ip_hook(priv, skb, state); ++ case htons(ETH_P_IPV6): ++ return nf_flow_offload_ipv6_hook(priv, skb, state); ++ } ++ ++ return NF_ACCEPT; ++} ++ ++static int ++xt_flowoffload_create_hook(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ struct nf_hook_ops *ops; ++ ++ hook = kzalloc(sizeof(*hook), GFP_ATOMIC); ++ if (!hook) ++ return -ENOMEM; ++ ++ ops = &hook->ops; ++ ops->pf = NFPROTO_NETDEV; ++ ops->hooknum = NF_NETDEV_INGRESS; ++ ops->priority = 10; ++ ops->priv = &table->ft; ++ ops->hook = xt_flowoffload_net_hook; ++ ops->dev = dev; ++ ++ hlist_add_head(&hook->list, &table->hooks); ++ mod_delayed_work(system_power_efficient_wq, &table->work, 0); ++ ++ return 0; ++} ++ ++static struct xt_flowoffload_hook * ++flow_offload_lookup_hook(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->ops.dev == dev) ++ return hook; ++ } ++ ++ return NULL; ++} ++ ++static void ++xt_flowoffload_check_device(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++ if (!dev) ++ return; ++ ++ spin_lock_bh(&hooks_lock); ++ hook = flow_offload_lookup_hook(table, dev); ++ if (hook) ++ hook->used = true; ++ else ++ xt_flowoffload_create_hook(table, dev); ++ spin_unlock_bh(&hooks_lock); ++} ++ ++static void ++xt_flowoffload_register_hooks(struct xt_flowoffload_table *table) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++restart: ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->registered) ++ continue; ++ ++ hook->registered = true; ++ hook->net = dev_net(hook->ops.dev); ++ spin_unlock_bh(&hooks_lock); ++ nf_register_net_hook(hook->net, &hook->ops); ++ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD) ++ table->ft.type->setup(&table->ft, hook->ops.dev, ++ FLOW_BLOCK_BIND); ++ spin_lock_bh(&hooks_lock); ++ goto restart; ++ } ++ ++} ++ ++static bool ++xt_flowoffload_cleanup_hooks(struct xt_flowoffload_table *table) ++{ ++ struct xt_flowoffload_hook *hook; ++ bool active = false; ++ ++restart: ++ 
spin_lock_bh(&hooks_lock); ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->used || !hook->registered) { ++ active = true; ++ continue; ++ } ++ ++ hlist_del(&hook->list); ++ spin_unlock_bh(&hooks_lock); ++ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD) ++ table->ft.type->setup(&table->ft, hook->ops.dev, ++ FLOW_BLOCK_UNBIND); ++ nf_unregister_net_hook(hook->net, &hook->ops); ++ kfree(hook); ++ goto restart; ++ } ++ spin_unlock_bh(&hooks_lock); ++ ++ return active; ++} ++ ++static void ++xt_flowoffload_check_hook(struct nf_flowtable *flowtable, ++ struct flow_offload *flow, void *data) ++{ ++ struct xt_flowoffload_table *table; ++ struct flow_offload_tuple *tuple0 = &flow->tuplehash[0].tuple; ++ struct flow_offload_tuple *tuple1 = &flow->tuplehash[1].tuple; ++ struct xt_flowoffload_hook *hook; ++ ++ table = container_of(flowtable, struct xt_flowoffload_table, ft); ++ ++ spin_lock_bh(&hooks_lock); ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->ops.dev->ifindex != tuple0->iifidx && ++ hook->ops.dev->ifindex != tuple1->iifidx) ++ continue; ++ ++ hook->used = true; ++ } ++ spin_unlock_bh(&hooks_lock); ++} ++ ++static void ++xt_flowoffload_hook_work(struct work_struct *work) ++{ ++ struct xt_flowoffload_table *table; ++ struct xt_flowoffload_hook *hook; ++ int err; ++ ++ table = container_of(work, struct xt_flowoffload_table, work.work); ++ ++ spin_lock_bh(&hooks_lock); ++ xt_flowoffload_register_hooks(table); ++ hlist_for_each_entry(hook, &table->hooks, list) ++ hook->used = false; ++ spin_unlock_bh(&hooks_lock); ++ ++ err = nf_flow_table_iterate(&table->ft, xt_flowoffload_check_hook, ++ NULL); ++ if (err && err != -EAGAIN) ++ goto out; ++ ++ if (!xt_flowoffload_cleanup_hooks(table)) ++ return; ++ ++out: ++ queue_delayed_work(system_power_efficient_wq, &table->work, HZ); ++} ++ ++static bool ++xt_flowoffload_skip(struct sk_buff *skb, int family) ++{ ++ if (skb_sec_path(skb)) ++ return true; ++ ++ if (family == NFPROTO_IPV4) { ++ const struct ip_options *opt = &(IPCB(skb)->opt); ++ ++ if (unlikely(opt->optlen)) ++ return true; ++ } ++ ++ return false; ++} ++ ++static enum flow_offload_xmit_type nf_xmit_type(struct dst_entry *dst) ++{ ++ if (dst_xfrm(dst)) ++ return FLOW_OFFLOAD_XMIT_XFRM; ++ ++ return FLOW_OFFLOAD_XMIT_NEIGH; ++} ++ ++static void nf_default_forward_path(struct nf_flow_route *route, ++ struct dst_entry *dst_cache, ++ enum ip_conntrack_dir dir, ++ struct net_device **dev) ++{ ++ dev[!dir] = dst_cache->dev; ++ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; ++ route->tuple[dir].dst = dst_cache; ++ route->tuple[dir].xmit_type = nf_xmit_type(dst_cache); ++} ++ ++static bool nf_is_valid_ether_device(const struct net_device *dev) ++{ ++ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || ++ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) ++ return false; ++ ++ return true; ++} ++ ++static void nf_dev_path_info(const struct net_device_path_stack *stack, ++ struct nf_forward_info *info, ++ unsigned char *ha) ++{ ++ const struct net_device_path *path; ++ int i; ++ ++ memcpy(info->h_dest, ha, ETH_ALEN); ++ ++ for (i = 0; i < stack->num_paths; i++) { ++ path = &stack->path[i]; ++ switch (path->type) { ++ case DEV_PATH_ETHERNET: ++ case DEV_PATH_DSA: ++ case DEV_PATH_VLAN: ++ case DEV_PATH_PPPOE: ++ info->indev = path->dev; ++ if (is_zero_ether_addr(info->h_source)) ++ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); ++ ++ if (path->type == DEV_PATH_ETHERNET) ++ break; ++ if (path->type == DEV_PATH_DSA) 
{ ++ i = stack->num_paths; ++ break; ++ } ++ ++ /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ ++ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { ++ info->indev = NULL; ++ break; ++ } ++ if (!info->outdev) ++ info->outdev = path->dev; ++ info->encap[info->num_encaps].id = path->encap.id; ++ info->encap[info->num_encaps].proto = path->encap.proto; ++ info->num_encaps++; ++ if (path->type == DEV_PATH_PPPOE) ++ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); ++ break; ++ case DEV_PATH_BRIDGE: ++ if (is_zero_ether_addr(info->h_source)) ++ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); ++ ++ switch (path->bridge.vlan_mode) { ++ case DEV_PATH_BR_VLAN_UNTAG_HW: ++ info->ingress_vlans |= BIT(info->num_encaps - 1); ++ break; ++ case DEV_PATH_BR_VLAN_TAG: ++ info->encap[info->num_encaps].id = path->bridge.vlan_id; ++ info->encap[info->num_encaps].proto = path->bridge.vlan_proto; ++ info->num_encaps++; ++ break; ++ case DEV_PATH_BR_VLAN_UNTAG: ++ info->num_encaps--; ++ break; ++ case DEV_PATH_BR_VLAN_KEEP: ++ break; ++ } ++ break; ++ default: ++ info->indev = NULL; ++ break; ++ } ++ } ++ if (!info->outdev) ++ info->outdev = info->indev; ++ ++ info->hw_outdev = info->indev; ++ ++ if (nf_is_valid_ether_device(info->indev)) ++ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; ++} ++ ++static int nf_dev_fill_forward_path(const struct nf_flow_route *route, ++ const struct dst_entry *dst_cache, ++ const struct nf_conn *ct, ++ enum ip_conntrack_dir dir, u8 *ha, ++ struct net_device_path_stack *stack) ++{ ++ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; ++ struct net_device *dev = dst_cache->dev; ++ struct neighbour *n; ++ u8 nud_state; ++ ++ if (!nf_is_valid_ether_device(dev)) ++ goto out; ++ ++ n = dst_neigh_lookup(dst_cache, daddr); ++ if (!n) ++ return -1; ++ ++ read_lock_bh(&n->lock); ++ nud_state = n->nud_state; ++ ether_addr_copy(ha, n->ha); ++ read_unlock_bh(&n->lock); ++ neigh_release(n); ++ ++ if (!(nud_state & NUD_VALID)) ++ return -1; ++ ++out: ++ return dev_fill_forward_path(dev, ha, stack); ++} ++ ++static void nf_dev_forward_path(struct nf_flow_route *route, ++ const struct nf_conn *ct, ++ enum ip_conntrack_dir dir, ++ struct net_device **devs) ++{ ++ const struct dst_entry *dst = route->tuple[dir].dst; ++ struct net_device_path_stack stack; ++ struct nf_forward_info info = {}; ++ unsigned char ha[ETH_ALEN]; ++ int i; ++ ++ if (nf_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) ++ nf_dev_path_info(&stack, &info, ha); ++ ++ devs[!dir] = (struct net_device *)info.indev; ++ if (!info.indev) ++ return; ++ ++ route->tuple[!dir].in.ifindex = info.indev->ifindex; ++ for (i = 0; i < info.num_encaps; i++) { ++ route->tuple[!dir].in.encap[i].id = info.encap[i].id; ++ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; ++ } ++ route->tuple[!dir].in.num_encaps = info.num_encaps; ++ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; ++ ++ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { ++ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); ++ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); ++ route->tuple[dir].out.ifindex = info.outdev->ifindex; ++ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; ++ route->tuple[dir].xmit_type = info.xmit_type; ++ } ++} ++ ++static int ++xt_flowoffload_route(struct sk_buff *skb, const struct nf_conn *ct, ++ const struct xt_action_param *par, ++ struct nf_flow_route *route, enum ip_conntrack_dir dir, ++ struct net_device **devs) ++{ ++ struct dst_entry *this_dst = skb_dst(skb); ++ struct dst_entry 
*other_dst = NULL; ++ struct flowi fl; ++ ++ memset(&fl, 0, sizeof(fl)); ++ switch (xt_family(par)) { ++ case NFPROTO_IPV4: ++ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; ++ fl.u.ip4.flowi4_oif = xt_in(par)->ifindex; ++ break; ++ case NFPROTO_IPV6: ++ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.dst.u3.in6; ++ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; ++ fl.u.ip6.flowi6_oif = xt_in(par)->ifindex; ++ break; ++ } ++ ++ nf_route(xt_net(par), &other_dst, &fl, false, xt_family(par)); ++ if (!other_dst) ++ return -ENOENT; ++ ++ nf_default_forward_path(route, this_dst, dir, devs); ++ nf_default_forward_path(route, other_dst, !dir, devs); ++ ++ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && ++ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { ++ nf_dev_forward_path(route, ct, dir, devs); ++ nf_dev_forward_path(route, ct, !dir, devs); ++ } ++ ++ return 0; ++} ++ ++static unsigned int ++flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par) ++{ ++ struct xt_flowoffload_table *table; ++ const struct xt_flowoffload_target_info *info = par->targinfo; ++ struct tcphdr _tcph, *tcph = NULL; ++ enum ip_conntrack_info ctinfo; ++ enum ip_conntrack_dir dir; ++ struct nf_flow_route route = {}; ++ struct flow_offload *flow = NULL; ++ struct net_device *devs[2] = {}; ++ struct nf_conn *ct; ++ struct net *net; ++ ++ if (xt_flowoffload_skip(skb, xt_family(par))) ++ return XT_CONTINUE; ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ if (ct == NULL) ++ return XT_CONTINUE; ++ ++ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { ++ case IPPROTO_TCP: ++ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) ++ return XT_CONTINUE; ++ ++ tcph = skb_header_pointer(skb, par->thoff, ++ sizeof(_tcph), &_tcph); ++ if (unlikely(!tcph || tcph->fin || tcph->rst)) ++ return XT_CONTINUE; ++ break; ++ case IPPROTO_UDP: ++ break; ++ default: ++ return XT_CONTINUE; ++ } ++ ++ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ++ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) ++ return XT_CONTINUE; ++ ++ if (!nf_ct_is_confirmed(ct)) ++ return XT_CONTINUE; ++ ++ devs[dir] = xt_out(par); ++ devs[!dir] = xt_in(par); ++ ++ if (!devs[dir] || !devs[!dir]) ++ return XT_CONTINUE; ++ ++ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) ++ return XT_CONTINUE; ++ ++ dir = CTINFO2DIR(ctinfo); ++ ++ if (xt_flowoffload_route(skb, ct, par, &route, dir, devs) < 0) ++ goto err_flow_route; ++ ++ flow = flow_offload_alloc(ct); ++ if (!flow) ++ goto err_flow_alloc; ++ ++ flow_offload_route_init(flow, &route); ++ ++ if (tcph) { ++ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ++ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ++ } ++ ++ table = &flowtable[!!(info->flags & XT_FLOWOFFLOAD_HW)]; ++ ++ net = read_pnet(&table->ft.net); ++ if (!net) ++ write_pnet(&table->ft.net, xt_net(par)); ++ ++ if (flow_offload_add(&table->ft, flow) < 0) ++ goto err_flow_add; ++ ++ xt_flowoffload_check_device(table, devs[0]); ++ xt_flowoffload_check_device(table, devs[1]); ++ ++ dst_release(route.tuple[!dir].dst); ++ ++ return XT_CONTINUE; ++ ++err_flow_add: ++ flow_offload_free(flow); ++err_flow_alloc: ++ dst_release(route.tuple[!dir].dst); ++err_flow_route: ++ clear_bit(IPS_OFFLOAD_BIT, &ct->status); ++ ++ return XT_CONTINUE; ++} ++ ++static int flowoffload_chk(const struct xt_tgchk_param *par) ++{ ++ struct xt_flowoffload_target_info *info = par->targinfo; ++ ++ if (info->flags & ~XT_FLOWOFFLOAD_MASK) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct xt_target offload_tg_reg 
__read_mostly = { ++ .family = NFPROTO_UNSPEC, ++ .name = "FLOWOFFLOAD", ++ .revision = 0, ++ .targetsize = sizeof(struct xt_flowoffload_target_info), ++ .usersize = sizeof(struct xt_flowoffload_target_info), ++ .checkentry = flowoffload_chk, ++ .target = flowoffload_tg, ++ .me = THIS_MODULE, ++}; ++ ++static int flow_offload_netdev_event(struct notifier_block *this, ++ unsigned long event, void *ptr) ++{ ++ struct xt_flowoffload_hook *hook0, *hook1; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ ++ if (event != NETDEV_UNREGISTER) ++ return NOTIFY_DONE; ++ ++ spin_lock_bh(&hooks_lock); ++ hook0 = flow_offload_lookup_hook(&flowtable[0], dev); ++ if (hook0) ++ hlist_del(&hook0->list); ++ ++ hook1 = flow_offload_lookup_hook(&flowtable[1], dev); ++ if (hook1) ++ hlist_del(&hook1->list); ++ spin_unlock_bh(&hooks_lock); ++ ++ if (hook0) { ++ nf_unregister_net_hook(hook0->net, &hook0->ops); ++ kfree(hook0); ++ } ++ ++ if (hook1) { ++ nf_unregister_net_hook(hook1->net, &hook1->ops); ++ kfree(hook1); ++ } ++ ++ nf_flow_table_cleanup(dev); ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block flow_offload_netdev_notifier = { ++ .notifier_call = flow_offload_netdev_event, ++}; ++ ++static int nf_flow_rule_route_inet(struct net *net, ++ struct flow_offload *flow, ++ enum flow_offload_tuple_dir dir, ++ struct nf_flow_rule *flow_rule) ++{ ++ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; ++ int err; ++ ++ switch (flow_tuple->l3proto) { ++ case NFPROTO_IPV4: ++ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); ++ break; ++ case NFPROTO_IPV6: ++ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); ++ break; ++ default: ++ err = -1; ++ break; ++ } ++ ++ return err; ++} ++ ++static struct nf_flowtable_type flowtable_inet = { ++ .family = NFPROTO_INET, ++ .init = nf_flow_table_init, ++ .setup = nf_flow_table_offload_setup, ++ .action = nf_flow_rule_route_inet, ++ .free = nf_flow_table_free, ++ .hook = xt_flowoffload_net_hook, ++ .owner = THIS_MODULE, ++}; ++ ++static int init_flowtable(struct xt_flowoffload_table *tbl) ++{ ++ INIT_DELAYED_WORK(&tbl->work, xt_flowoffload_hook_work); ++ tbl->ft.type = &flowtable_inet; ++ tbl->ft.flags = NF_FLOWTABLE_COUNTER; ++ ++ return nf_flow_table_init(&tbl->ft); ++} ++ ++static int __init xt_flowoffload_tg_init(void) ++{ ++ int ret; ++ ++ register_netdevice_notifier(&flow_offload_netdev_notifier); ++ ++ ret = init_flowtable(&flowtable[0]); ++ if (ret) ++ return ret; ++ ++ ret = init_flowtable(&flowtable[1]); ++ if (ret) ++ goto cleanup; ++ ++ flowtable[1].ft.flags |= NF_FLOWTABLE_HW_OFFLOAD; ++ ++ ret = xt_register_target(&offload_tg_reg); ++ if (ret) ++ goto cleanup2; ++ ++ return 0; ++ ++cleanup2: ++ nf_flow_table_free(&flowtable[1].ft); ++cleanup: ++ nf_flow_table_free(&flowtable[0].ft); ++ return ret; ++} ++ ++static void __exit xt_flowoffload_tg_exit(void) ++{ ++ xt_unregister_target(&offload_tg_reg); ++ unregister_netdevice_notifier(&flow_offload_netdev_notifier); ++ nf_flow_table_free(&flowtable[0].ft); ++ nf_flow_table_free(&flowtable[1].ft); ++} ++ ++MODULE_LICENSE("GPL"); ++module_init(xt_flowoffload_tg_init); ++module_exit(xt_flowoffload_tg_exit); diff --git a/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch new file mode 100644 index 0000000..06f4c22 --- /dev/null +++ 
b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch @@ -0,0 +1,152 @@ +From 772c6e460211ac740b2550fa75be36b8a49731fe Mon Sep 17 00:00:00 2001 +From: "mfreemon@cloudflare.com" +Date: Tue, 1 Mar 2022 17:06:02 -0600 +Subject: [PATCH] tcp: Add a sysctl to skip tcp collapse processing when the + receive buffer is full + +For context and additional information about this patch, see the +blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/ + +sysctl: net.ipv4.tcp_collapse_max_bytes + +If tcp_collapse_max_bytes is non-zero, attempt to collapse the +queue to free up memory if the current amount of memory allocated +is less than tcp_collapse_max_bytes. Otherwise, the packet is +dropped without attempting to collapse the queue. + +If tcp_collapse_max_bytes is zero, this feature is disabled +and the default Linux behavior is used. The default Linux +behavior is to always perform the attempt to collapse the +queue to free up memory. + +When the receive queue is small, we want to collapse the +queue. There are two reasons for this: (a) the latency of +performing the collapse will be small on a small queue, and +(b) we want to avoid sending a congestion signal (via a +packet drop) to the sender when the receive queue is small. + +The result is that we avoid latency spikes caused by the +time it takes to perform the collapse logic when the receive +queue is large and full, while preserving existing behavior +and performance for all other cases. + +Signed-off-by: Alexandre Frade +--- + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 +++++++ + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ + net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ + net/ipv4/tcp_ipv4.c | 1 + + 5 files changed, 52 insertions(+) + +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -223,6 +223,7 @@ struct netns_ipv4 { + + u8 sysctl_fib_notify_on_flag_change; + u8 sysctl_tcp_syn_linear_timeouts; ++ unsigned int sysctl_tcp_collapse_max_bytes; + + #ifdef CONFIG_NET_L3_MASTER_DEV + u8 sysctl_udp_l3mdev_accept; +--- a/include/trace/events/tcp.h ++++ b/include/trace/events/tcp.h +@@ -213,6 +213,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space + TP_ARGS(sk) + ); + ++DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, ++ ++ TP_PROTO(struct sock *sk), ++ ++ TP_ARGS(sk) ++); ++ + TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -1558,6 +1558,13 @@ static struct ctl_table ipv4_net_table[] + .extra2 = SYSCTL_ONE, + }, + { ++ .procname = "tcp_collapse_max_bytes", ++ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ }, ++ { + .procname = "tcp_pingpong_thresh", + .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh, + .maxlen = sizeof(u8), +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5645,6 +5645,7 @@ static bool tcp_prune_ofo_queue(struct s + static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); + +@@ -5656,6 +5657,39 @@ static int tcp_prune_queue(struct sock * + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++ /* For context and additional information about this patch, see the ++ * blog post at 
++ * ++ * sysctl: net.ipv4.tcp_collapse_max_bytes ++ * ++ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the ++ * queue to free up memory if the current amount of memory allocated ++ * is less than tcp_collapse_max_bytes. Otherwise, the packet is ++ * dropped without attempting to collapse the queue. ++ * ++ * If tcp_collapse_max_bytes is zero, this feature is disabled ++ * and the default Linux behavior is used. The default Linux ++ * behavior is to always perform the attempt to collapse the ++ * queue to free up memory. ++ * ++ * When the receive queue is small, we want to collapse the ++ * queue. There are two reasons for this: (a) the latency of ++ * performing the collapse will be small on a small queue, and ++ * (b) we want to avoid sending a congestion signal (via a ++ * packet drop) to the sender when the receive queue is small. ++ * ++ * The result is that we avoid latency spikes caused by the ++ * time it takes to perform the collapse logic when the receive ++ * queue is large and full, while preserving existing behavior ++ * and performance for all other cases. ++ */ ++ if (net->ipv4.sysctl_tcp_collapse_max_bytes && ++ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { ++ /* We are dropping the packet */ ++ trace_tcp_collapse_max_bytes_exceeded(sk); ++ goto do_not_collapse; ++ } ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +@@ -5674,6 +5708,8 @@ static int tcp_prune_queue(struct sock * + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++do_not_collapse: ++ + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -3508,6 +3508,7 @@ static int __net_init tcp_sk_init(struct + + net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; + net->ipv4.sysctl_tcp_shrink_window = 0; ++ net->ipv4.sysctl_tcp_collapse_max_bytes = 0; + + net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); diff --git a/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch new file mode 100644 index 0000000..4581564 --- /dev/null +++ b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch @@ -0,0 +1,191 @@ +From 454d96573eea54fcb7714a4a455d13173dfb9768 Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] PCI: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. 
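As an aside on the tcp_collapse_max_bytes sysctl added by the Cloudflare patch above: it defaults to 0, which keeps the stock always-collapse behaviour, and any non-zero value acts as a byte threshold on sk_rmem_alloc. A hedged sketch of enabling it, the 6 MiB figure being purely illustrative:

    # illustrative threshold; 0 (the default) preserves the old behaviour
    sysctl -w net.ipv4.tcp_collapse_max_bytes=6291456
    # persist across reboots
    echo 'net.ipv4.tcp_collapse_max_bytes = 6291456' > /etc/sysctl.d/99-tcp-collapse.conf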
If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Rebased-by: Alexandre Frade +Signed-off-by: Mark Weiman +Signed-off-by: Alexandre Frade +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 102 ++++++++++++++++++ + 2 files changed, 111 insertions(+) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4414,6 +4414,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. 
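A worked example of the boot option documented above, typically appended to the kernel command line via the bootloader; the 10de:1c82 ID is only a placeholder, and the advice from the commit message applies: use the most limited set of overrides necessary:

    # placeholder device ID; options may be combined, comma-separated
    pcie_acs_override=downstream,multifunction,id:10de:1c82
    # after rebooting, check how the IOMMU groups came out
    find /sys/kernel/iommu_groups/ -type l | sort -V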
+--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3747,6 +3747,106 @@ static void quirk_no_bus_reset(struct pc + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. 
+@@ -5168,6 +5268,8 @@ static const struct pci_dev_acs_enabled + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, ++ /* ACS override */ ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + diff --git a/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch new file mode 100644 index 0000000..a32e0d3 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch @@ -0,0 +1,219 @@ +From e054b70adebd85ed4ab62b458ee6ec3e902d970f Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 27 Feb 2022 14:46:08 -0800 +Subject: [PATCH 1/6] extcon: Add driver for Steam Deck + +(cherry picked from commit f9f2eddae582ae39d5f89c1218448fc259b90aa8) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/extcon/Kconfig | 7 ++ + drivers/extcon/Makefile | 1 + + drivers/extcon/extcon-steamdeck.c | 180 ++++++++++++++++++++++++++++++ + 3 files changed, 188 insertions(+) + create mode 100644 drivers/extcon/extcon-steamdeck.c + +--- a/drivers/extcon/Kconfig ++++ b/drivers/extcon/Kconfig +@@ -203,4 +203,11 @@ config EXTCON_RTK_TYPE_C + The DHC (Digital Home Hub) RTD series SoC contains a type c module. + This driver will detect the status of the type-c port. + ++config EXTCON_STEAMDECK ++ tristate "Steam Deck extcon support" ++ depends on MFD_STEAMDECK ++ help ++ Say Y here to enable support of USB Type C cable detection extcon ++ support on Steam Deck devices ++ + endif +--- a/drivers/extcon/Makefile ++++ b/drivers/extcon/Makefile +@@ -26,3 +26,4 @@ obj-$(CONFIG_EXTCON_USB_GPIO) += extcon- + obj-$(CONFIG_EXTCON_USBC_CROS_EC) += extcon-usbc-cros-ec.o + obj-$(CONFIG_EXTCON_USBC_TUSB320) += extcon-usbc-tusb320.o + obj-$(CONFIG_EXTCON_RTK_TYPE_C) += extcon-rtk-type-c.o ++obj-$(CONFIG_EXTCON_STEAMDECK) += extcon-steamdeck.o +--- /dev/null ++++ b/drivers/extcon/extcon-steamdeck.c +@@ -0,0 +1,180 @@ ++ ++#include ++#include ++#include ++ ++#define ACPI_STEAMDECK_NOTIFY_STATUS 0x80 ++ ++/* 0 - port connected, 1 -port disconnected */ ++#define ACPI_STEAMDECK_PORT_CONNECT BIT(0) ++/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */ ++#define ACPI_STEAMDECK_CUR_DATA_ROLE BIT(3) ++/* ++ * Debouncing delay to allow negotiation process to settle. 2s value ++ * was arrived at via trial and error. 
++ */ ++#define STEAMDECK_ROLE_SWITCH_DELAY (msecs_to_jiffies(2000)) ++ ++struct steamdeck_extcon { ++ struct acpi_device *adev; ++ struct delayed_work role_work; ++ struct extcon_dev *edev; ++ struct device *dev; ++}; ++ ++static int steamdeck_read_pdcs(struct steamdeck_extcon *sd, unsigned long long *pdcs) ++{ ++ acpi_status status; ++ ++ status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs); ++ if (ACPI_FAILURE(status)) { ++ dev_err(sd->dev, "PDCS evaluation failed: %s\n", ++ acpi_format_exception(status)); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++static void steamdeck_usb_role_work(struct work_struct *work) ++{ ++ struct steamdeck_extcon *sd = ++ container_of(work, struct steamdeck_extcon, role_work.work); ++ unsigned long long pdcs; ++ bool usb_host; ++ ++ if (steamdeck_read_pdcs(sd, &pdcs)) ++ return; ++ ++ /* ++ * We only care about these two ++ */ ++ pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE; ++ ++ /* ++ * For "connect" events our role is determined by a bit in ++ * PDCS, for "disconnect" we switch to being a gadget ++ * unconditionally. The thinking for the latter is we don't ++ * want to start acting as a USB host until we get ++ * confirmation from the firmware that we are a USB host ++ */ ++ usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? ++ pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false; ++ ++ dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device"); ++ WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST, ++ usb_host)); ++ ++} ++ ++static void steamdeck_notify(acpi_handle handle, u32 event, void *context) ++{ ++ struct device *dev = context; ++ struct steamdeck_extcon *sd = dev_get_drvdata(dev); ++ unsigned long long pdcs; ++ unsigned long delay; ++ ++ switch (event) { ++ case ACPI_STEAMDECK_NOTIFY_STATUS: ++ if (steamdeck_read_pdcs(sd, &pdcs)) ++ return; ++ /* ++ * We process "disconnect" events immediately and ++ * "connect" events with a delay to give the HW time ++ * to settle. For example attaching USB hub (at least ++ * for HW used for testing) will generate intermediary ++ * event with "host" bit not set, followed by the one ++ * that does have it set. ++ */ ++ delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? 
++ STEAMDECK_ROLE_SWITCH_DELAY : 0; ++ ++ queue_delayed_work(system_long_wq, &sd->role_work, delay); ++ break; ++ default: ++ dev_warn(dev, "Unsupported event [0x%x]\n", event); ++ } ++} ++ ++static void steamdeck_remove_notify_handler(void *data) ++{ ++ struct steamdeck_extcon *sd = data; ++ ++ acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY, ++ steamdeck_notify); ++ cancel_delayed_work_sync(&sd->role_work); ++} ++ ++static const unsigned int steamdeck_extcon_cable[] = { ++ EXTCON_USB, ++ EXTCON_USB_HOST, ++ EXTCON_CHG_USB_SDP, ++ EXTCON_CHG_USB_CDP, ++ EXTCON_CHG_USB_DCP, ++ EXTCON_CHG_USB_ACA, ++ EXTCON_NONE, ++}; ++ ++static int steamdeck_extcon_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_extcon *sd; ++ acpi_status status; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work); ++ platform_set_drvdata(pdev, sd); ++ sd->adev = ACPI_COMPANION(dev->parent); ++ sd->dev = dev; ++ sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable); ++ if (IS_ERR(sd->edev)) ++ return PTR_ERR(sd->edev); ++ ++ ret = devm_extcon_dev_register(dev, sd->edev); ++ if (ret < 0) { ++ dev_err(dev, "Failed to register extcon device: %d\n", ret); ++ return ret; ++ } ++ ++ /* ++ * Set initial role value ++ */ ++ queue_delayed_work(system_long_wq, &sd->role_work, 0); ++ flush_delayed_work(&sd->role_work); ++ ++ status = acpi_install_notify_handler(sd->adev->handle, ++ ACPI_DEVICE_NOTIFY, ++ steamdeck_notify, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "Error installing ACPI notify handler\n"); ++ return -EIO; ++ } ++ ++ ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler, ++ sd); ++ return ret; ++} ++ ++static const struct platform_device_id steamdeck_extcon_id_table[] = { ++ { .name = "steamdeck-extcon" }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_extcon_id_table); ++ ++static struct platform_driver steamdeck_extcon_driver = { ++ .probe = steamdeck_extcon_probe, ++ .driver = { ++ .name = "steamdeck-extcon", ++ }, ++ .id_table = steamdeck_extcon_id_table, ++}; ++module_platform_driver(steamdeck_extcon_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck extcon driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch new file mode 100644 index 0000000..ec80cc4 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch @@ -0,0 +1,274 @@ +From be312a466f817a38a48668f93bb07a75ff9fa875 Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 19 Feb 2022 16:09:45 -0800 +Subject: [PATCH 2/6] hwmon: Add driver for Steam Deck's EC sensors + +Add driver for sensors exposed by EC firmware on Steam Deck hardware. + +(cherry picked from commit 6917aac77bee6185ae3920b936cdbe7876118c0b) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/hwmon/Kconfig | 11 ++ + drivers/hwmon/Makefile | 1 + + drivers/hwmon/steamdeck-hwmon.c | 224 ++++++++++++++++++++++++++++++++ + 3 files changed, 236 insertions(+) + create mode 100644 drivers/hwmon/steamdeck-hwmon.c + +--- a/drivers/hwmon/Kconfig ++++ b/drivers/hwmon/Kconfig +@@ -2050,6 +2050,17 @@ config SENSORS_SCH5636 + This driver can also be built as a module. 
If so, the module + will be called sch5636. + ++config SENSORS_STEAMDECK ++ tristate "Steam Deck EC sensors" ++ depends on MFD_STEAMDECK ++ help ++ If you say yes here you get support for the hardware ++ monitoring features exposed by EC firmware on Steam Deck ++ devices ++ ++ This driver can also be built as a module. If so, the module ++ will be called steamdeck-hwmon. ++ + config SENSORS_STTS751 + tristate "ST Microelectronics STTS751" + depends on I2C +--- a/drivers/hwmon/Makefile ++++ b/drivers/hwmon/Makefile +@@ -207,6 +207,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47 + obj-$(CONFIG_SENSORS_SMSC47M192)+= smsc47m192.o + obj-$(CONFIG_SENSORS_SPARX5) += sparx5-temp.o + obj-$(CONFIG_SENSORS_SPD5118) += spd5118.o ++obj-$(CONFIG_SENSORS_STEAMDECK) += steamdeck-hwmon.o + obj-$(CONFIG_SENSORS_STTS751) += stts751.o + obj-$(CONFIG_SENSORS_SURFACE_FAN)+= surface_fan.o + obj-$(CONFIG_SENSORS_SY7636A) += sy7636a-hwmon.o +--- /dev/null ++++ b/drivers/hwmon/steamdeck-hwmon.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Steam Deck EC sensors driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ */ ++ ++#include ++#include ++#include ++ ++#define STEAMDECK_HWMON_NAME "steamdeck-hwmon" ++ ++struct steamdeck_hwmon { ++ struct acpi_device *adev; ++}; ++ ++static long ++steamdeck_hwmon_get(struct steamdeck_hwmon *sd, const char *method) ++{ ++ unsigned long long val; ++ if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, ++ (char *)method, NULL, &val))) ++ return -EIO; ++ ++ return val; ++} ++ ++static int ++steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, long *out) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ ++ switch (type) { ++ case hwmon_curr: ++ if (attr != hwmon_curr_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "PDAM"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_in: ++ if (attr != hwmon_in_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "PDVL"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_temp: ++ if (attr != hwmon_temp_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "BATT"); ++ if (*out < 0) ++ return *out; ++ /* ++ * Assuming BATT returns deg C we need to mutiply it ++ * by 1000 to convert to mC ++ */ ++ *out *= 1000; ++ break; ++ case hwmon_fan: ++ switch (attr) { ++ case hwmon_fan_input: ++ *out = steamdeck_hwmon_get(sd, "FANR"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_fan_target: ++ *out = steamdeck_hwmon_get(sd, "FSSR"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_fan_fault: ++ *out = steamdeck_hwmon_get(sd, "FANC"); ++ if (*out < 0) ++ return *out; ++ /* ++ * FANC (Fan check): ++ * 0: Abnormal ++ * 1: Normal ++ */ ++ *out = !*out; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static int ++steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, const char **str) ++{ ++ switch (type) { ++ /* ++ * These two aren't, strictly speaking, measured. EC ++ * firmware just reports what PD negotiation resulted ++ * in. 
++ */ ++ case hwmon_curr: ++ *str = "PD Contract Current"; ++ break; ++ case hwmon_in: ++ *str = "PD Contract Voltage"; ++ break; ++ case hwmon_temp: ++ *str = "Battery Temp"; ++ break; ++ case hwmon_fan: ++ *str = "System Fan"; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static int ++steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, long val) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ ++ if (type != hwmon_fan || ++ attr != hwmon_fan_target) ++ return -EOPNOTSUPP; ++ ++ val = clamp_val(val, 0, 7300); ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "FANS", val))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static umode_t ++steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, ++ u32 attr, int channel) ++{ ++ if (type == hwmon_fan && ++ attr == hwmon_fan_target) ++ return 0644; ++ ++ return 0444; ++} ++ ++static const struct hwmon_channel_info *steamdeck_hwmon_info[] = { ++ HWMON_CHANNEL_INFO(in, ++ HWMON_I_INPUT | HWMON_I_LABEL), ++ HWMON_CHANNEL_INFO(curr, ++ HWMON_C_INPUT | HWMON_C_LABEL), ++ HWMON_CHANNEL_INFO(temp, ++ HWMON_T_INPUT | HWMON_T_LABEL), ++ HWMON_CHANNEL_INFO(fan, ++ HWMON_F_INPUT | HWMON_F_LABEL | ++ HWMON_F_TARGET | HWMON_F_FAULT), ++ NULL ++}; ++ ++static const struct hwmon_ops steamdeck_hwmon_ops = { ++ .is_visible = steamdeck_hwmon_is_visible, ++ .read = steamdeck_hwmon_read, ++ .read_string = steamdeck_hwmon_read_string, ++ .write = steamdeck_hwmon_write, ++}; ++ ++static const struct hwmon_chip_info steamdeck_hwmon_chip_info = { ++ .ops = &steamdeck_hwmon_ops, ++ .info = steamdeck_hwmon_info, ++}; ++ ++static int steamdeck_hwmon_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_hwmon *sd; ++ struct device *hwmon; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ sd->adev = ACPI_COMPANION(dev->parent); ++ hwmon = devm_hwmon_device_register_with_info(dev, ++ "steamdeck_hwmon", ++ sd, ++ &steamdeck_hwmon_chip_info, ++ NULL); ++ if (IS_ERR(hwmon)) { ++ dev_err(dev, "Failed to register HWMON device"); ++ return PTR_ERR(hwmon); ++ } ++ ++ return 0; ++} ++ ++static const struct platform_device_id steamdeck_hwmon_id_table[] = { ++ { .name = STEAMDECK_HWMON_NAME }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_hwmon_id_table); ++ ++static struct platform_driver steamdeck_hwmon_driver = { ++ .probe = steamdeck_hwmon_probe, ++ .driver = { ++ .name = STEAMDECK_HWMON_NAME, ++ }, ++ .id_table = steamdeck_hwmon_id_table, ++}; ++module_platform_driver(steamdeck_hwmon_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck EC sensors driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch new file mode 100644 index 0000000..874d5b9 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch @@ -0,0 +1,104 @@ +From efe7b80f8a7448265d238c13ed1b6e327a9c739e Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 15 Jul 2023 12:58:54 -0700 +Subject: [PATCH 3/6] hwmon: steamdeck-hwmon: Add support for max battery + level/rate + +Add support for max battery level/charge rate attributes. 
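A short sketch of driving the two new attributes from userspace; the hwmon index is an assumption, so it is located by name first, and both values are 0-100 with the exact semantics defined by the EC firmware:

    # find the right hwmon instance (index varies with probe order)
    grep -l steamdeck_hwmon /sys/class/hwmon/hwmon*/name
    # assuming it turned out to be hwmon5:
    cat /sys/class/hwmon/hwmon5/max_battery_charge_level
    echo 80 > /sys/class/hwmon/hwmon5/max_battery_charge_level
    echo 50 > /sys/class/hwmon/hwmon5/max_battery_charge_rate

Values of 101 and above are rejected with -EINVAL by the store helper added below.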
+ +Signed-off-by: Andrey Smirnov +(cherry picked from commit 50af83e8fd75dc52221edd3fb6fd7a7f70c4d8a4) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/hwmon/steamdeck-hwmon.c | 72 ++++++++++++++++++++++++++++++++- + 1 file changed, 71 insertions(+), 1 deletion(-) + +--- a/drivers/hwmon/steamdeck-hwmon.c ++++ b/drivers/hwmon/steamdeck-hwmon.c +@@ -180,6 +180,76 @@ static const struct hwmon_chip_info stea + .info = steamdeck_hwmon_info, + }; + ++ ++static ssize_t ++steamdeck_hwmon_simple_store(struct device *dev, const char *buf, size_t count, ++ const char *method, ++ unsigned long upper_limit) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ unsigned long value; ++ ++ if (kstrtoul(buf, 10, &value) || value >= upper_limit) ++ return -EINVAL; ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ (char *)method, value))) ++ return -EIO; ++ ++ return count; ++} ++ ++static ssize_t ++steamdeck_hwmon_simple_show(struct device *dev, char *buf, ++ const char *method) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ unsigned long value; ++ ++ value = steamdeck_hwmon_get(sd, method); ++ if (value < 0) ++ return value; ++ ++ return sprintf(buf, "%ld\n", value); ++} ++ ++#define STEAMDECK_HWMON_ATTR_RW(_name, _set_method, _get_method, \ ++ _upper_limit) \ ++ static ssize_t _name##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ return steamdeck_hwmon_simple_show(dev, buf, \ ++ _get_method); \ ++ } \ ++ static ssize_t _name##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ return steamdeck_hwmon_simple_store(dev, buf, count, \ ++ _set_method, \ ++ _upper_limit); \ ++ } \ ++ static DEVICE_ATTR_RW(_name) ++ ++STEAMDECK_HWMON_ATTR_RW(max_battery_charge_level, "FCBL", "SFBL", 101); ++STEAMDECK_HWMON_ATTR_RW(max_battery_charge_rate, "CHGR", "GCHR", 101); ++ ++static struct attribute *steamdeck_hwmon_attributes[] = { ++ &dev_attr_max_battery_charge_level.attr, ++ &dev_attr_max_battery_charge_rate.attr, ++ NULL ++}; ++ ++static const struct attribute_group steamdeck_hwmon_group = { ++ .attrs = steamdeck_hwmon_attributes, ++}; ++ ++static const struct attribute_group *steamdeck_hwmon_groups[] = { ++ &steamdeck_hwmon_group, ++ NULL ++}; ++ + static int steamdeck_hwmon_probe(struct platform_device *pdev) + { + struct device *dev = &pdev->dev; +@@ -195,7 +265,7 @@ static int steamdeck_hwmon_probe(struct + "steamdeck_hwmon", + sd, + &steamdeck_hwmon_chip_info, +- NULL); ++ steamdeck_hwmon_groups); + if (IS_ERR(hwmon)) { + dev_err(dev, "Failed to register HWMON device"); + return PTR_ERR(hwmon); diff --git a/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch new file mode 100644 index 0000000..98c746a --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch @@ -0,0 +1,118 @@ +From e76032abd59b7d2b15c8106630423e3091b298ce Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 27 Feb 2022 12:58:05 -0800 +Subject: [PATCH 4/6] leds: steamdeck: Add support for Steam Deck LED + +(cherry picked from commit 85a86d19aa7022ff0555023d53aef78323a42d0c) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/leds/Kconfig | 7 ++++ + drivers/leds/Makefile | 1 + + drivers/leds/leds-steamdeck.c | 74 +++++++++++++++++++++++++++++++++++ 
+ 3 files changed, 82 insertions(+) + create mode 100644 drivers/leds/leds-steamdeck.c + +--- a/drivers/leds/Kconfig ++++ b/drivers/leds/Kconfig +@@ -951,6 +951,13 @@ config LEDS_ACER_A500 + This option enables support for the Power Button LED of + Acer Iconia Tab A500. + ++config LEDS_STEAMDECK ++ tristate "LED support for Steam Deck" ++ depends on LEDS_CLASS && MFD_STEAMDECK ++ help ++ This option enabled support for the status LED (next to the ++ power button) on Steam Deck ++ + source "drivers/leds/blink/Kconfig" + + comment "Flash and Torch LED drivers" +--- a/drivers/leds/Makefile ++++ b/drivers/leds/Makefile +@@ -81,6 +81,7 @@ obj-$(CONFIG_LEDS_POWERNV) += leds-powe + obj-$(CONFIG_LEDS_PWM) += leds-pwm.o + obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o + obj-$(CONFIG_LEDS_SC27XX_BLTC) += leds-sc27xx-bltc.o ++obj-$(CONFIG_LEDS_STEAMDECK) += leds-steamdeck.o + obj-$(CONFIG_LEDS_SUN50I_A100) += leds-sun50i-a100.o + obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o + obj-$(CONFIG_LEDS_SYSCON) += leds-syscon.o +--- /dev/null ++++ b/drivers/leds/leds-steamdeck.c +@@ -0,0 +1,74 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++ ++/* ++ * Steam Deck EC MFD LED cell driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ * ++ */ ++ ++#include ++#include ++#include ++ ++struct steamdeck_led { ++ struct acpi_device *adev; ++ struct led_classdev cdev; ++}; ++ ++static int steamdeck_leds_brightness_set(struct led_classdev *cdev, ++ enum led_brightness value) ++{ ++ struct steamdeck_led *sd = container_of(cdev, struct steamdeck_led, ++ cdev); ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "CHBV", value))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static int steamdeck_leds_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_led *sd; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ sd->adev = ACPI_COMPANION(dev->parent); ++ ++ sd->cdev.name = "status:white"; ++ sd->cdev.brightness_set_blocking = steamdeck_leds_brightness_set; ++ sd->cdev.max_brightness = 100; ++ ++ ret = devm_led_classdev_register(dev, &sd->cdev); ++ if (ret) { ++ dev_err(dev, "Failed to register LEDs device: %d\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct platform_device_id steamdeck_leds_id_table[] = { ++ { .name = "steamdeck-leds" }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_leds_id_table); ++ ++static struct platform_driver steamdeck_leds_driver = { ++ .probe = steamdeck_leds_probe, ++ .driver = { ++ .name = "steamdeck-leds", ++ }, ++ .id_table = steamdeck_leds_id_table, ++}; ++module_platform_driver(steamdeck_leds_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck LEDs driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch new file mode 100644 index 0000000..8227294 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch @@ -0,0 +1,176 @@ +From c3cf089a9da1216d3e1e047eba2ec46e0febe457 Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 19 Feb 2022 16:08:36 -0800 +Subject: [PATCH 5/6] mfd: Add MFD core driver for Steam Deck + +Add MFD core driver for Steam Deck. Doesn't really do much so far +besides instantiating a number of MFD cells that implement all the +interesting functionality. 
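Once this core driver binds to the ACPI device, the cells listed further down become child platform devices, and the root device itself grows a pair of read-only attributes (firmware_version, board_id). A sketch of inspecting them, with the sysfs path derived from the VLV0100 ACPI ID and therefore an assumption:

    # path assumed from the ACPI ID; adjust if the instance suffix differs
    cat /sys/bus/platform/devices/VLV0100:00/firmware_version
    cat /sys/bus/platform/devices/VLV0100:00/board_id
    # the hwmon, leds and extcon cells appear as their own platform devices
    ls /sys/bus/platform/devices/ | grep steamdeck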
+ +(cherry picked from commit 5f534c2d6ebdefccb9c024eb0f013bc1c0c622d9) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/mfd/Kconfig | 11 ++++ + drivers/mfd/Makefile | 2 + + drivers/mfd/steamdeck.c | 127 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 140 insertions(+) + create mode 100644 drivers/mfd/steamdeck.c + +--- a/drivers/mfd/Kconfig ++++ b/drivers/mfd/Kconfig +@@ -2390,5 +2390,16 @@ config MFD_RSMU_SPI + Additional drivers must be enabled in order to use the functionality + of the device. + ++config MFD_STEAMDECK ++ tristate "Valve Steam Deck" ++ select MFD_CORE ++ depends on ACPI ++ depends on X86_64 || COMPILE_TEST ++ help ++ This driver registers various MFD cells that expose aspects ++ of Steam Deck specific ACPI functionality. ++ ++ Say N here, unless you are running on Steam Deck hardware. ++ + endmenu + endif +--- a/drivers/mfd/Makefile ++++ b/drivers/mfd/Makefile +@@ -288,3 +288,5 @@ obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x + + obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o + obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o ++ ++obj-$(CONFIG_MFD_STEAMDECK) += steamdeck.o +--- /dev/null ++++ b/drivers/mfd/steamdeck.c +@@ -0,0 +1,127 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++ ++/* ++ * Steam Deck EC MFD core driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#define STEAMDECK_STA_OK \ ++ (ACPI_STA_DEVICE_ENABLED | \ ++ ACPI_STA_DEVICE_PRESENT | \ ++ ACPI_STA_DEVICE_FUNCTIONING) ++ ++struct steamdeck { ++ struct acpi_device *adev; ++ struct device *dev; ++}; ++ ++#define STEAMDECK_ATTR_RO(_name, _method) \ ++ static ssize_t _name##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct steamdeck *sd = dev_get_drvdata(dev); \ ++ unsigned long long val; \ ++ \ ++ if (ACPI_FAILURE(acpi_evaluate_integer( \ ++ sd->adev->handle, \ ++ _method, NULL, &val))) \ ++ return -EIO; \ ++ \ ++ return sysfs_emit(buf, "%llu\n", val); \ ++ } \ ++ static DEVICE_ATTR_RO(_name) ++ ++STEAMDECK_ATTR_RO(firmware_version, "PDFW"); ++STEAMDECK_ATTR_RO(board_id, "BOID"); ++ ++static struct attribute *steamdeck_attrs[] = { ++ &dev_attr_firmware_version.attr, ++ &dev_attr_board_id.attr, ++ NULL ++}; ++ ++ATTRIBUTE_GROUPS(steamdeck); ++ ++static const struct mfd_cell steamdeck_cells[] = { ++ { .name = "steamdeck-hwmon" }, ++ { .name = "steamdeck-leds" }, ++ { .name = "steamdeck-extcon" }, ++}; ++ ++static void steamdeck_remove_sysfs_groups(void *data) ++{ ++ struct steamdeck *sd = data; ++ ++ sysfs_remove_groups(&sd->dev->kobj, steamdeck_groups); ++} ++ ++static int steamdeck_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ unsigned long long sta; ++ struct steamdeck *sd; ++ acpi_status status; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ sd->adev = ACPI_COMPANION(dev); ++ sd->dev = dev; ++ platform_set_drvdata(pdev, sd); ++ ++ status = acpi_evaluate_integer(sd->adev->handle, "_STA", ++ NULL, &sta); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "Status check failed (0x%x)\n", status); ++ return -EINVAL; ++ } ++ ++ if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) { ++ dev_err(dev, "Device is not ready\n"); ++ return -EINVAL; ++ } ++ ++ ret = sysfs_create_groups(&dev->kobj, steamdeck_groups); ++ if (ret) { ++ dev_err(dev, "Failed to create sysfs group\n"); ++ return ret; ++ } ++ ++ ret = devm_add_action_or_reset(dev, steamdeck_remove_sysfs_groups, ++ sd); ++ if (ret) 
{ ++ dev_err(dev, "Failed to register devres action\n"); ++ return ret; ++ } ++ ++ return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, ++ steamdeck_cells, ARRAY_SIZE(steamdeck_cells), ++ NULL, 0, NULL); ++} ++ ++static const struct acpi_device_id steamdeck_device_ids[] = { ++ { "VLV0100", 0 }, ++ { "", 0 }, ++}; ++MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids); ++ ++static struct platform_driver steamdeck_driver = { ++ .probe = steamdeck_probe, ++ .driver = { ++ .name = "steamdeck", ++ .acpi_match_table = steamdeck_device_ids, ++ }, ++}; ++module_platform_driver(steamdeck_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck EC MFD core driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch new file mode 100644 index 0000000..7153da1 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch @@ -0,0 +1,49 @@ +From c24abd65482a08109e756c612e83c8c8450a263b Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 24 Sep 2023 15:02:33 -0700 +Subject: [PATCH 6/6] mfd: steamdeck: Expose controller board power in sysfs + +As of version 118 Deck's BIOS implements "SCBP" method that allows +gating power of the controller board (VBUS). Add a basic WO method to +our root MFD device to allow toggling that. + +Signed-off-by: Andrey Smirnov +(cherry picked from commit f97f32718acc10cbb51fef925842392e80904d74) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/mfd/steamdeck.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/drivers/mfd/steamdeck.c ++++ b/drivers/mfd/steamdeck.c +@@ -41,9 +41,29 @@ struct steamdeck { + STEAMDECK_ATTR_RO(firmware_version, "PDFW"); + STEAMDECK_ATTR_RO(board_id, "BOID"); + ++static ssize_t controller_board_power_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct steamdeck *sd = dev_get_drvdata(dev); ++ bool enabled; ++ ssize_t ret = kstrtobool(buf, &enabled); ++ ++ if (ret) ++ return ret; ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "SCBP", enabled))) ++ return -EIO; ++ ++ return count; ++} ++static DEVICE_ATTR_WO(controller_board_power); ++ + static struct attribute *steamdeck_attrs[] = { + &dev_attr_firmware_version.attr, + &dev_attr_board_id.attr, ++ &dev_attr_controller_board_power.attr, + NULL + }; + diff --git a/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch b/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch new file mode 100644 index 0000000..967c233 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch @@ -0,0 +1,25 @@ +From 2a949c896ea66e647d94254c28b1d4d6194ee14b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 00:55:35 +0000 +Subject: [PATCH 03/19] XANMOD: kbuild: Add GCC SMS-based modulo scheduling + flags + +Signed-off-by: Alexandre Frade +--- + Makefile | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/Makefile ++++ b/Makefile +@@ -822,6 +822,11 @@ KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s + endif + ++# Perform swing modulo scheduling immediately before the first scheduling pass. 
++# This pass looks at innermost loops and reorders their instructions by ++# overlapping different iterations. ++KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves) ++ + # Always set `debug-assertions` and `overflow-checks` because their default + # depends on `opt-level` and `debug-assertions`, respectively. + KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n) diff --git a/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch new file mode 100644 index 0000000..4bf2185 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch @@ -0,0 +1,76 @@ +From 867a311f231e89b4c9194579e40718f751761ffc Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sat, 31 Aug 2024 16:57:41 +0000 +Subject: [PATCH 04/19] kbuild: Remove GCC minimal function alignment + +Signed-off-by: Alexandre Frade +--- + Makefile | 7 ------- + arch/Kconfig | 12 ------------ + include/linux/compiler_types.h | 10 +++++----- + 3 files changed, 5 insertions(+), 24 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -981,15 +981,8 @@ export CC_FLAGS_FPU + export CC_FLAGS_NO_FPU + + ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) +-# Set the minimal function alignment. Use the newer GCC option +-# -fmin-function-alignment if it is available, or fall back to -falign-funtions. +-# See also CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT. +-ifdef CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT +-KBUILD_CFLAGS += -fmin-function-alignment=$(CONFIG_FUNCTION_ALIGNMENT) +-else + KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) + endif +-endif + + # arch Makefile may override CC so keep this after arch Makefile is included + NOSTDINC_FLAGS += -nostdinc +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1628,18 +1628,6 @@ config FUNCTION_ALIGNMENT + default 4 if FUNCTION_ALIGNMENT_4B + default 0 + +-config CC_HAS_MIN_FUNCTION_ALIGNMENT +- # Detect availability of the GCC option -fmin-function-alignment which +- # guarantees minimal alignment for all functions, unlike +- # -falign-functions which the compiler ignores for cold functions. +- def_bool $(cc-option, -fmin-function-alignment=8) +- +-config CC_HAS_SANE_FUNCTION_ALIGNMENT +- # Set if the guaranteed alignment with -fmin-function-alignment is +- # available or extra care is required in the kernel. Clang provides +- # strict alignment always, even with -falign-functions. +- def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG +- + config ARCH_NEED_CMPXCHG_1_EMU + bool + +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -99,17 +99,17 @@ static inline void __chk_io_ptr(const vo + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute + * + * When -falign-functions=N is in use, we must avoid the cold attribute as +- * GCC drops the alignment for cold functions. Worse, GCC can implicitly mark +- * callees of cold functions as cold themselves, so it's not sufficient to add +- * __function_aligned here as that will not ensure that callees are correctly +- * aligned. ++ * contemporary versions of GCC drop the alignment for cold functions. Worse, ++ * GCC can implicitly mark callees of cold functions as cold themselves, so ++ * it's not sufficient to add __function_aligned here as that will not ensure ++ * that callees are correctly aligned. 
+ * + * See: + * + * https://lore.kernel.org/lkml/Y77%2FqVgvaJidFpYt@FVFF77S0Q05N + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c9 + */ +-#if defined(CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT) || (CONFIG_FUNCTION_ALIGNMENT == 0) ++#if !defined(CONFIG_CC_IS_GCC) || (CONFIG_FUNCTION_ALIGNMENT == 0) + #define __cold __attribute__((__cold__)) + #else + #define __cold diff --git a/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch new file mode 100644 index 0000000..bcbb768 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch @@ -0,0 +1,22 @@ +From e98005253085b5a00881c085fab388a4742e700b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 11 May 2023 19:41:41 +0000 +Subject: [PATCH 05/19] XANMOD: fair: Set scheduler tunable latencies to + unscaled + +Signed-off-by: Alexandre Frade +--- + kernel/sched/fair.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -66,7 +66,7 @@ + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ +-unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + + /* + * Minimal preemption granularity for CPU-bound tasks: diff --git a/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch new file mode 100644 index 0000000..2f6063e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch @@ -0,0 +1,71 @@ +From 07341955b471371f5414f94814c49289de9319cf Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sun, 15 Sep 2024 23:03:38 +0000 +Subject: [PATCH 06/19] XANMOD: sched: Add yield_type sysctl to reduce or + disable sched_yield + +Signed-off-by: Alexandre Frade +--- + kernel/sched/syscalls.c | 16 +++++++++++++++- + kernel/sysctl.c | 10 ++++++++++ + 2 files changed, 25 insertions(+), 1 deletion(-) + +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -1457,15 +1457,29 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t + return ret; + } + ++/** ++ * sysctl_sched_yield_type - choose the yield level that will perform. ++ * ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. ++ * 2: Re-queue current tasks. 
(default CFS) ++ */ ++__read_mostly int sysctl_sched_yield_type = 2; ++ + static void do_sched_yield(void) + { + struct rq_flags rf; + struct rq *rq; + ++ if (!sysctl_sched_yield_type) ++ return; ++ + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); +- current->sched_class->yield_task(rq); ++ ++ if (sysctl_sched_yield_type > 1) ++ current->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -97,6 +97,7 @@ static const int six_hundred_forty_kb = + #endif + + ++extern int sysctl_sched_yield_type; + static const int ngroups_max = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + +@@ -1631,6 +1632,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++ { ++ .procname = "yield_type", ++ .data = &sysctl_sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", diff --git a/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch new file mode 100644 index 0000000..5c5c68e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch @@ -0,0 +1,39 @@ +From 9d6ba825d8c8635e217f7596fb4401c9ef9d408a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 11 May 2022 18:56:51 +0000 +Subject: [PATCH 07/19] XANMOD: block/mq-deadline: Increase write priority to + improve responsiveness + +Signed-off-by: Alexandre Frade +--- + block/mq-deadline.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -4,6 +4,9 @@ + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe ++ * ++ * Tunes for responsiveness by Alexandre Frade ++ * (C) 2022 Alexandre Frade + */ + #include + #include +@@ -28,13 +31,13 @@ + * See Documentation/block/deadline-iosched.rst + */ + static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +-static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ ++static const int write_expire = HZ; /* ditto for writes, these limits are SOFT! */ + /* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ + static const int prio_aging_expire = 10 * HZ; +-static const int writes_starved = 2; /* max times reads can starve a write */ ++static const int writes_starved = 1; /* max times reads can starve a write */ + static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. 
*/ + diff --git a/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch new file mode 100644 index 0000000..00b1d78 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch @@ -0,0 +1,22 @@ +From f4b45e0e5444254caf5992cde662236867ac388b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 6 Jan 2022 16:59:01 +0000 +Subject: [PATCH 08/19] XANMOD: block/mq-deadline: Disable front_merges by + default + +Signed-off-by: Alexandre Frade +--- + block/mq-deadline.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -600,7 +600,7 @@ static int dd_init_sched(struct request_ + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; + dd->writes_starved = writes_starved; +- dd->front_merges = 1; ++ dd->front_merges = 0; + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; diff --git a/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch new file mode 100644 index 0000000..63f64f6 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch @@ -0,0 +1,23 @@ +From 08580ad4d8ffb10cb30a228361d2d914a1502e3c Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 15:36:01 +0000 +Subject: [PATCH 09/19] XANMOD: block: Set rq_affinity to force complete I/O + requests on same CPU + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -605,7 +605,8 @@ enum { + QUEUE_FLAG_MAX + }; + +-#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) ++#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ ++ (1UL << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); diff --git a/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch new file mode 100644 index 0000000..10bf8f8 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch @@ -0,0 +1,30 @@ +From 278088038e8259faf193e23ac16b48b0350da6fb Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 15 Jul 2024 04:50:34 +0000 +Subject: [PATCH 10/19] XANMOD: blk-wbt: Set wbt_default_latency_nsec() to + 2msec + +Signed-off-by: Alexandre Frade +--- + block/blk-wbt.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +--- a/block/blk-wbt.c ++++ b/block/blk-wbt.c +@@ -730,14 +730,8 @@ EXPORT_SYMBOL_GPL(wbt_enable_default); + + u64 wbt_default_latency_nsec(struct request_queue *q) + { +- /* +- * We default to 2msec for non-rotational storage, and 75msec +- * for rotational storage. 
+- */ +- if (blk_queue_nonrot(q)) +- return 2000000ULL; +- else +- return 75000000ULL; ++ /* XanMod defaults to 2msec for any type of storage */ ++ return 2000000ULL; + } + + static int wbt_data_dir(const struct request *rq) diff --git a/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch new file mode 100644 index 0000000..1b9156c --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch @@ -0,0 +1,35 @@ +From 1637fbf551572f4578ed5644ae485b8da659b509 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 17:26:15 +0000 +Subject: [PATCH 11/19] XANMOD: kconfig: add 500Hz timer interrupt kernel + config option + +Signed-off-by: Alexandre Frade +--- + kernel/Kconfig.hz | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +60,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK diff --git a/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch new file mode 100644 index 0000000..0cee4de --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch @@ -0,0 +1,22 @@ +From 53a8dc8afb62674acc347ac7d9ebb46338ab7d64 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 16:59:22 +0000 +Subject: [PATCH 12/19] XANMOD: dcache: cache_pressure = 50 decreases the rate + at which VFS caches are reclaimed + +Signed-off-by: Alexandre Frade +--- + fs/dcache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -73,7 +73,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); diff --git a/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch new file mode 100644 index 0000000..354cd74 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch @@ -0,0 +1,21 @@ +From 73df371934a8b28f57030858965f6869c3705836 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 14 Aug 2024 18:54:53 +0000 +Subject: [PATCH 14/19] XANMOD: mm/vmscan: Set minimum amount of swapping + +Signed-off-by: Alexandre Frade +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -199,7 +199,7 @@ struct scan_control { + /* + * From 0 .. MAX_SWAPPINESS. 
Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 1; + + #ifdef CONFIG_MEMCG + diff --git a/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch new file mode 100644 index 0000000..ac8cbe4 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch @@ -0,0 +1,84 @@ +From 8b42012dc4aa0008ad10cf7575f684ebc3a587cf Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 15 Jun 2022 17:07:29 +0000 +Subject: [PATCH 15/19] XANMOD: sched/autogroup: Add kernel parameter and + config option to enable/disable autogroup feature by default + +Signed-off-by: Alexandre Frade +--- + Documentation/admin-guide/kernel-parameters.txt | 6 ++++-- + init/Kconfig | 12 ++++++++++++ + kernel/sched/autogroup.c | 9 ++++++--- + 3 files changed, 22 insertions(+), 5 deletions(-) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -493,6 +493,10 @@ + Format: (must be >=0) + Default: 64 + ++ autogroup= [KNL] Enable or disable scheduler automatic task group ++ creation. ++ Format: ++ + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } +@@ -3835,8 +3839,6 @@ + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any + IOAPICs that may be present in the system. + +- noautogroup Disable scheduler automatic task group creation. +- + nocache [ARM,EARLY] + + no_console_suspend +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1309,6 +1309,18 @@ config SCHED_AUTOGROUP + desktop applications. Task group autogeneration is currently based + upon task session. + ++config SCHED_AUTOGROUP_DEFAULT_ENABLED ++ bool "Enable automatic process group scheduling feature" ++ default y ++ depends on SCHED_AUTOGROUP ++ help ++ If set, automatic process group scheduling will be enabled per ++ default but can be disabled through passing autogroup=0 on the ++ kernel commandline during boot or a value of 0 via the file ++ proc/sys/kernel/sched_autogroup_enabled. ++ ++ If unsure say Y. ++ + config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK +--- a/kernel/sched/autogroup.c ++++ b/kernel/sched/autogroup.c +@@ -4,7 +4,8 @@ + * Auto-group scheduling implementation: + */ + +-unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; ++unsigned int __read_mostly sysctl_sched_autogroup_enabled = ++ IS_ENABLED(CONFIG_SCHED_AUTOGROUP_DEFAULT_ENABLED) ? 1 : 0; + static struct autogroup autogroup_default; + static atomic_t autogroup_seq_nr; + +@@ -219,11 +220,13 @@ void sched_autogroup_exit(struct signal_ + + static int __init setup_autogroup(char *str) + { +- sysctl_sched_autogroup_enabled = 0; ++ unsigned long enabled; ++ if (!kstrtoul(str, 0, &enabled)) ++ sysctl_sched_autogroup_enabled = enabled ? 
1 : 0; + + return 1; + } +-__setup("noautogroup", setup_autogroup); ++__setup("autogroup=", setup_autogroup); + + #ifdef CONFIG_PROC_FS + diff --git a/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch new file mode 100644 index 0000000..b8f9198 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch @@ -0,0 +1,62 @@ +From f41037c4ee33dd201c1750727daa390a3f652516 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Tue, 31 Mar 2020 13:32:08 -0300 +Subject: [PATCH 16/19] XANMOD: cpufreq: tunes ondemand and conservative + governor for performance + +Signed-off-by: Alexandre Frade +--- + drivers/cpufreq/cpufreq_conservative.c | 8 ++++---- + drivers/cpufreq/cpufreq_ondemand.c | 8 ++++---- + 2 files changed, 8 insertions(+), 8 deletions(-) + +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +@@ -47,9 +47,9 @@ static inline unsigned int get_freq_step + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Every sampling_rate * +- * sampling_down_factor, we check, if current idle time is more than 80% ++ * sampling_down_factor, we check, if current idle time is more than 74% + * (default), then we try to decrease frequency + * + * Frequency updates happen at minimum steps of 5% (default) of maximum +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,10 +18,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_SAMPLING_DOWN_FACTOR (100) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) + +@@ -128,7 +128,7 @@ static void dbs_freq_increase(struct cpu + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ diff --git a/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch new file mode 100644 index 0000000..9a2d7f8 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch @@ -0,0 +1,42 @@ +From 000614733021b6723cfd78a4908b9b2e869ea001 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 08:09:56 +0000 +Subject: [PATCH 17/19] XANMOD: lib/kconfig.debug: disable default + SYMBOLIC_ERRNAME and DEBUG_BUGVERBOSE + +Signed-off-by: Alexandre Frade +--- + fs/bcachefs/Kconfig | 1 - + lib/Kconfig.debug | 4 ++-- + 2 files changed, 2 insertions(+), 3 deletions(-) + +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -23,7 +23,6 @@ config BCACHEFS_FS + select XOR_BLOCKS + select XXHASH + select SRCU +- select SYMBOLIC_ERRNAME + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -190,7 +190,7 @@ config DYNAMIC_DEBUG_CORE + + config SYMBOLIC_ERRNAME + bool "Support symbolic error names in printf" +- default y if PRINTK ++ default n + help + If you say Y here, the kernel's printf implementation will + be able to print symbolic error names such as ENOSPC instead +@@ -200,7 +200,7 @@ config SYMBOLIC_ERRNAME + config DEBUG_BUGVERBOSE + bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT + depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE) +- default y ++ default n + help + Say Y here to make BUG() panics output the file name and line number + of the BUG call as well as the EIP and oops trace. This aids diff --git a/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch new file mode 100644 index 0000000..86c8375 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch @@ -0,0 +1,21 @@ +From bea9f2e6b80582dc36b5693e08633546b14b2a4e Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sun, 29 May 2022 00:57:40 +0000 +Subject: [PATCH 18/19] XANMOD: scripts/setlocalversion: remove "+" tag for git + repo short version + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 1 - + 1 file changed, 1 deletion(-) + +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -92,7 +92,6 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" + return + fi + # If we are past the tagged commit, we pretty print it. 
diff --git a/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch new file mode 100644 index 0000000..148106e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch @@ -0,0 +1,19 @@ +From eb42d6c8411f9a70472e6f3632051e2e44910843 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 24 Apr 2023 04:50:34 +0000 +Subject: [PATCH 19/19] XANMOD: scripts/setlocalversion: Move localversion* + files to the end + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -182,4 +182,4 @@ elif [ "${LOCALVERSION+set}" != "set" ]; + scm_version="$(scm_version --short)" + fi + +-echo "${KERNELVERSION}${file_localversion}${config_localversion}${LOCALVERSION}${scm_version}" ++echo "${KERNELVERSION}${config_localversion}${LOCALVERSION}${scm_version}${file_localversion}" diff --git a/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch b/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch new file mode 100644 index 0000000..b43efe5 --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch @@ -0,0 +1,38 @@ +From 2b70fecae5eabc439190c435be020db8afd3bd33 Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Wed, 7 Aug 2024 17:04:02 -0400 +Subject: Partially revert "drm/amd/amdgpu: add pipe1 hardware support" + +This partially reverts commit b7a1a0ef12b81957584fef7b61e2d5ec049c7209. + +A user reported stuttering under heavy gfx load with this commit. +I suspect it's due to the fact that the gfx contexts are shared +between the pipes so if there is alot of load on one pipe, we could +end up stalling waiting for a context. + +That said, having both pipes is useful in some contexts and +this patch was actually enabled mainly to support some SR-IOV +use cases, so leave it enabled for SR-IOV. 
+ +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3519 +Signed-off-by: Alex Deucher +Cc: ZhenGuo Yin +Cc: Alex Deucher +--- + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +@@ -4711,7 +4711,10 @@ static int gfx_v10_0_sw_init(void *handl + case IP_VERSION(10, 3, 3): + case IP_VERSION(10, 3, 7): + adev->gfx.me.num_me = 1; +- adev->gfx.me.num_pipe_per_me = 2; ++ if (amdgpu_sriov_vf(adev)) ++ adev->gfx.me.num_pipe_per_me = 2; ++ else ++ adev->gfx.me.num_pipe_per_me = 1; + adev->gfx.me.num_queue_per_pipe = 1; + adev->gfx.mec.num_mec = 2; + adev->gfx.mec.num_pipe_per_mec = 4; diff --git a/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch new file mode 100644 index 0000000..2fb4597 --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch @@ -0,0 +1,42 @@ +From 3c881c7323af4269aa58848dd9da694d5a73d353 Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Thu, 3 Oct 2024 09:57:38 -0400 +Subject: drm/amdgpu/swsmu: default to fullscreen 3D profile for dGPUs + +This uses more aggressive hueristics than the the bootup default +profile. On windows the OS has a special fullscreen 3D mode +where this is used. Since we don't have the equivalent on Linux +default to this profile for dGPUs. + +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3618 +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1500 +Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3131 +Fixes: c50fe289ed72 ("drm/amdgpu/swsmu: always force a state reprogram on init") +Reviewed-by: Kenneth Feng +Signed-off-by: Alex Deucher +--- + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -1257,7 +1257,6 @@ static int smu_sw_init(void *handle) + atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); + atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); + +- smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; + smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; + smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; + smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; +@@ -1266,6 +1265,11 @@ static int smu_sw_init(void *handle) + smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; + smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; + ++ if (smu->is_apu) ++ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; ++ else ++ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; ++ + smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; + smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D; + smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING; diff --git a/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch new file mode 100644 index 0000000..9e4c756 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch @@ -0,0 +1,1183 @@ +From cba900f94444c8c4cde6b5b4edfaeb6160b55ba3 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 26 Apr 2021 
22:12:46 +0200 +Subject: ZEN: Add VHBA driver + +remote https://github.com/cdemu/cdemu +tag vhba-module-20240202 +--- + drivers/scsi/Kconfig | 2 + + drivers/scsi/Makefile | 1 + + drivers/scsi/vhba/Kconfig | 9 + + drivers/scsi/vhba/Makefile | 4 + + drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++ + 5 files changed, 1140 insertions(+) + create mode 100644 drivers/scsi/vhba/Kconfig + create mode 100644 drivers/scsi/vhba/Makefile + create mode 100644 drivers/scsi/vhba/vhba.c + +--- a/drivers/scsi/Kconfig ++++ b/drivers/scsi/Kconfig +@@ -1522,4 +1522,6 @@ endif # SCSI_LOWLEVEL + + source "drivers/scsi/device_handler/Kconfig" + ++source "drivers/scsi/vhba/Kconfig" ++ + endmenu +--- a/drivers/scsi/Makefile ++++ b/drivers/scsi/Makefile +@@ -153,6 +153,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o + obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + + obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ ++obj-$(CONFIG_VHBA) += vhba/ + + # This goes last, so that "real" scsi devices probe earlier + obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o +--- /dev/null ++++ b/drivers/scsi/vhba/Kconfig +@@ -0,0 +1,9 @@ ++config VHBA ++ tristate "Virtual (SCSI) Host Bus Adapter" ++ depends on SCSI ++ help ++ This is the in-kernel part of CDEmu, a CD/DVD-ROM device ++ emulator. ++ ++ This driver can also be built as a module. If so, the module ++ will be called vhba. +--- /dev/null ++++ b/drivers/scsi/vhba/Makefile +@@ -0,0 +1,4 @@ ++VHBA_VERSION := 20240202 ++ ++obj-$(CONFIG_VHBA) += vhba.o ++ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror +--- /dev/null ++++ b/drivers/scsi/vhba/vhba.c +@@ -0,0 +1,1124 @@ ++/* ++ * vhba.c ++ * ++ * Copyright (C) 2007-2012 Chia-I Wu ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#define pr_fmt(fmt) "vhba: " fmt ++ ++#include ++ ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) ++#include ++#else ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_COMPAT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++MODULE_AUTHOR("Chia-I Wu"); ++MODULE_VERSION(VHBA_VERSION); ++MODULE_DESCRIPTION("Virtual SCSI HBA"); ++MODULE_LICENSE("GPL"); ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) ++#define sdev_dbg(sdev, fmt, a...) \ ++ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) ++#define scmd_dbg(scmd, fmt, a...) 
\ ++ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) ++#endif ++ ++#define VHBA_MAX_SECTORS_PER_IO 256 ++#define VHBA_MAX_BUS 16 ++#define VHBA_MAX_ID 16 ++#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) ++#define VHBA_KBUF_SIZE PAGE_SIZE ++ ++#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++ ++ ++static int vhba_can_queue = 32; ++module_param_named(can_queue, vhba_can_queue, int, 0); ++ ++ ++enum vhba_req_state { ++ VHBA_REQ_FREE, ++ VHBA_REQ_PENDING, ++ VHBA_REQ_READING, ++ VHBA_REQ_SENT, ++ VHBA_REQ_WRITING, ++}; ++ ++struct vhba_command { ++ struct scsi_cmnd *cmd; ++ /* metatags are per-host. not to be confused with ++ queue tags that are usually per-lun */ ++ unsigned long metatag; ++ int status; ++ struct list_head entry; ++}; ++ ++struct vhba_device { ++ unsigned int num; ++ spinlock_t cmd_lock; ++ struct list_head cmd_list; ++ wait_queue_head_t cmd_wq; ++ atomic_t refcnt; ++ ++ unsigned char *kbuf; ++ size_t kbuf_size; ++}; ++ ++struct vhba_host { ++ struct Scsi_Host *shost; ++ spinlock_t cmd_lock; ++ int cmd_next; ++ struct vhba_command *commands; ++ spinlock_t dev_lock; ++ struct vhba_device *devices[VHBA_MAX_DEVICES]; ++ int num_devices; ++ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); ++ int chgtype[VHBA_MAX_DEVICES]; ++ struct work_struct scan_devices; ++}; ++ ++#define MAX_COMMAND_SIZE 16 ++ ++struct vhba_request { ++ __u32 metatag; ++ __u32 lun; ++ __u8 cdb[MAX_COMMAND_SIZE]; ++ __u8 cdb_len; ++ __u32 data_len; ++}; ++ ++struct vhba_response { ++ __u32 metatag; ++ __u32 status; ++ __u32 data_len; ++}; ++ ++ ++ ++static struct vhba_command *vhba_alloc_command (void); ++static void vhba_free_command (struct vhba_command *vcmd); ++ ++static struct platform_device vhba_platform_device; ++ ++ ++ ++/* These functions define a symmetric 1:1 mapping between device numbers and ++ the bus and id. We have reserved the last id per bus for the host itself. 
*/ ++static void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) ++{ ++ *bus = devnum / (VHBA_MAX_ID-1); ++ *id = devnum % (VHBA_MAX_ID-1); ++} ++ ++static unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) ++{ ++ return (bus * (VHBA_MAX_ID-1)) + id; ++} ++ ++static struct vhba_device *vhba_device_alloc (void) ++{ ++ struct vhba_device *vdev; ++ ++ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); ++ if (!vdev) { ++ return NULL; ++ } ++ ++ spin_lock_init(&vdev->cmd_lock); ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ init_waitqueue_head(&vdev->cmd_wq); ++ atomic_set(&vdev->refcnt, 1); ++ ++ vdev->kbuf = NULL; ++ vdev->kbuf_size = 0; ++ ++ return vdev; ++} ++ ++static void vhba_device_put (struct vhba_device *vdev) ++{ ++ if (atomic_dec_and_test(&vdev->refcnt)) { ++ kfree(vdev); ++ } ++} ++ ++static struct vhba_device *vhba_device_get (struct vhba_device *vdev) ++{ ++ atomic_inc(&vdev->refcnt); ++ ++ return vdev; ++} ++ ++static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vcmd = vhba_alloc_command(); ++ if (!vcmd) { ++ return SCSI_MLQUEUE_HOST_BUSY; ++ } ++ ++ vcmd->cmd = cmd; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; ++#else ++ vcmd->metatag = vcmd->cmd->request->tag; ++#endif ++ list_add_tail(&vcmd->entry, &vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ wake_up_interruptible(&vdev->cmd_wq); ++ ++ return 0; ++} ++ ++static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_command *vcmd; ++ int retval; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->cmd == cmd) { ++ list_del_init(&vcmd->entry); ++ break; ++ } ++ } ++ ++ /* command not found */ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ return SUCCESS; ++ } ++ ++ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ scmd_dbg(cmd, "wait for I/O before aborting\n"); ++ schedule_timeout(1); ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; ++ ++ vhba_free_command(vcmd); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return retval; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++static int vhba_slave_alloc(struct scsi_device *sdev) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ ++ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++ if (!shost_use_blk_mq(shost) && shost->bqt) { ++#else ++ if (shost->bqt) { ++#endif ++ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); ++ } ++ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); ++ ++ return 0; ++} ++#endif ++ ++static void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (!sdev) { ++ scsi_add_device(vhost->shost, bus, id, 0); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); ++ scsi_device_put(sdev); ++ } ++} ++ ++static void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (sdev) { ++ scsi_remove_device(sdev); ++ scsi_device_put(sdev); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); ++ } ++} ++ ++static void vhba_scan_devices (struct work_struct *work) ++{ ++ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); ++ unsigned long flags; ++ int change, exists; ++ unsigned int devnum; ++ unsigned int bus, id; ++ ++ for (;;) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ ++ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); ++ if (devnum >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ break; ++ } ++ change = vhost->chgtype[devnum]; ++ exists = vhost->devices[devnum] != NULL; ++ ++ vhost->chgtype[devnum] = 0; ++ clear_bit(devnum, vhost->chgmap); ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ devnum_to_bus_and_id(devnum, &bus, &id); ++ ++ if (change < 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ } else if (change > 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* quick sequence of add/remove or remove/add; we determine ++ which one it was by checking if device structure exists */ ++ if (exists) { ++ /* remove followed by add: remove and (re)add */ ++ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* add followed by remove: no-op */ ++ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); ++ } ++ } ++ } ++} ++ ++static int vhba_add_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned int devnum; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vhba_device_get(vdev); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ if (vhost->num_devices >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ vhba_device_put(vdev); ++ return -EBUSY; ++ } ++ ++ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { ++ if (vhost->devices[devnum] == NULL) { ++ vdev->num = devnum; ++ 
vhost->devices[devnum] = vdev; ++ vhost->num_devices++; ++ set_bit(devnum, vhost->chgmap); ++ vhost->chgtype[devnum]++; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static int vhba_remove_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ set_bit(vdev->num, vhost->chgmap); ++ vhost->chgtype[vdev->num]--; ++ vhost->devices[vdev->num] = NULL; ++ vhost->num_devices--; ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ vhba_device_put(vdev); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static struct vhba_device *vhba_lookup_device (int devnum) ++{ ++ struct vhba_host *vhost; ++ struct vhba_device *vdev = NULL; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ if (likely(devnum < VHBA_MAX_DEVICES)) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ vdev = vhost->devices[devnum]; ++ if (vdev) { ++ vdev = vhba_device_get(vdev); ++ } ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ } ++ ++ return vdev; ++} ++ ++static struct vhba_command *vhba_alloc_command (void) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ int i; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ ++ vcmd = vhost->commands + vhost->cmd_next++; ++ if (vcmd->status != VHBA_REQ_FREE) { ++ for (i = 0; i < vhba_can_queue; i++) { ++ vcmd = vhost->commands + i; ++ ++ if (vcmd->status == VHBA_REQ_FREE) { ++ vhost->cmd_next = i + 1; ++ break; ++ } ++ } ++ ++ if (i == vhba_can_queue) { ++ vcmd = NULL; ++ } ++ } ++ ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ vhost->cmd_next %= vhba_can_queue; ++ ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++ ++ return vcmd; ++} ++ ++static void vhba_free_command (struct vhba_command *vcmd) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ vcmd->status = VHBA_REQ_FREE; ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++} ++ ++static int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ unsigned int devnum; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); ++#else ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); ++#endif ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (!vdev) { ++ scmd_dbg(cmd, "no such device\n"); ++ ++ cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(cmd); ++#else ++ cmd->scsi_done(cmd); ++#endif ++ ++ return 0; ++ } ++ ++ retval = vhba_device_queue(vdev, cmd); ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_abort (struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval = SUCCESS; ++ unsigned int devnum; ++ ++ scmd_dbg(cmd, "abort %p\n", cmd); ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (vdev) { ++ retval = vhba_device_dequeue(vdev, cmd); ++ vhba_device_put(vdev); ++ } else { ++ cmd->result = DID_NO_CONNECT << 16; ++ } ++ ++ return 
retval; ++} ++ ++static struct scsi_host_template vhba_template = { ++ .module = THIS_MODULE, ++ .name = "vhba", ++ .proc_name = "vhba", ++ .queuecommand = vhba_queuecommand, ++ .eh_abort_handler = vhba_abort, ++ .this_id = -1, ++ .max_sectors = VHBA_MAX_SECTORS_PER_IO, ++ .sg_tablesize = 256, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++ .slave_alloc = vhba_slave_alloc, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) ++ .tag_alloc_policy = BLK_TAG_ALLOC_RR, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ .use_blk_tags = 1, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) ++ .max_segment_size = VHBA_KBUF_SIZE, ++#endif ++}; ++ ++static ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) ++{ ++ struct vhba_request vreq; ++ ssize_t ret; ++ ++ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", ++ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); ++ ++ ret = sizeof(vreq); ++ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { ++ ret += scsi_bufflen(cmd); ++ } ++ ++ if (ret > buf_len) { ++ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); ++ return -EIO; ++ } ++ ++ vreq.metatag = metatag; ++ vreq.lun = cmd->device->lun; ++ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); ++ vreq.cdb_len = cmd->cmd_len; ++ vreq.data_len = scsi_bufflen(cmd); ++ ++ if (copy_to_user(buf, &vreq, sizeof(vreq))) { ++ return -EFAULT; ++ } ++ ++ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { ++ buf += sizeof(vreq); ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *) buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = sg->length; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(vdev->kbuf, kaddr + sg->offset, len); ++ kunmap_atomic(kaddr); ++ ++ if (copy_to_user(uaddr, vdev->kbuf, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ } ++ } else { ++ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { ++ return -EFAULT; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) ++{ ++ ssize_t ret = 0; ++ ++ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", ++ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); ++ ++ if (res->status) { ++ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { ++ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); ++ res->data_len = SCSI_SENSE_BUFFERSIZE; ++ } ++ ++ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ cmd->result = res->status; ++ ++ ret += res->data_len; ++ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { ++ size_t to_read; ++ ++ if (res->data_len > scsi_bufflen(cmd)) { ++ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); ++ res->data_len = scsi_bufflen(cmd); ++ } ++ ++ to_read = res->data_len; ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist 
*sg; ++ int i; ++ ++ uaddr = (unsigned char *)buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = (sg->length < to_read) ? sg->length : to_read; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ if (copy_from_user(vdev->kbuf, uaddr, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(kaddr + sg->offset, vdev->kbuf, len); ++ kunmap_atomic(kaddr); ++ ++ to_read -= len; ++ if (to_read == 0) { ++ break; ++ } ++ } ++ } else { ++ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ to_read -= res->data_len; ++ } ++ ++ scsi_set_resid(cmd, to_read); ++ ++ ret += res->data_len - to_read; ++ } ++ ++ return ret; ++} ++ ++static struct vhba_command *next_command (struct vhba_device *vdev) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->status == VHBA_REQ_PENDING) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->metatag == metatag) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) ++{ ++ struct vhba_command *vcmd; ++ DEFINE_WAIT(wait); ++ ++ while (!(vcmd = next_command(vdev))) { ++ if (signal_pending(current)) { ++ break; ++ } ++ ++ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ schedule(); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ finish_wait(&vdev->cmd_wq, &wait); ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_READING; ++ } ++ ++ return vcmd; ++} ++ ++static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ ssize_t ret; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ /* Get next command */ ++ if (file->f_flags & O_NONBLOCK) { ++ /* Non-blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = next_command(vdev); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -EWOULDBLOCK; ++ } ++ } else { ++ /* Blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = wait_command(vdev, flags); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -ERESTARTSYS; ++ } ++ } ++ ++ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++ vcmd->status = VHBA_REQ_SENT; ++ *offset += ret; ++ } else { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ struct vhba_response res; ++ ssize_t ret; ++ unsigned long flags; ++ ++ if (buf_len < sizeof(res)) { ++ return -EIO; ++ } ++ ++ if (copy_from_user(&res, buf, sizeof(res))) { ++ return -EFAULT; ++ } ++ ++ vdev = file->private_data; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = 
match_command(vdev, res.metatag); ++ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ pr_debug("ctl dev #%u not expecting response\n", vdev->num); ++ return -EIO; ++ } ++ vcmd->status = VHBA_REQ_WRITING; ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ ret += sizeof(res); ++ ++ /* don't compete with vhba_device_dequeue */ ++ if (!list_empty(&vcmd->entry)) { ++ list_del_init(&vcmd->entry); ++ vhba_free_command(vcmd); ++ } ++ } else { ++ vcmd->status = VHBA_REQ_SENT; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vhba_device *vdev = file->private_data; ++ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ switch (cmd) { ++ case 0xBEEF001: { ++ unsigned int ident[4]; /* host, channel, id, lun */ ++ ++ ident[0] = vhost->shost->host_no; ++ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); ++ ident[3] = 0; /* lun */ ++ ++ if (copy_to_user((void *) arg, ident, sizeof(ident))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ case 0xBEEF002: { ++ unsigned int devnum = vdev->num; ++ ++ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ } ++ ++ return -ENOTTY; ++} ++ ++#ifdef CONFIG_COMPAT ++static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned long compat_arg = (unsigned long)compat_ptr(arg); ++ return vhba_ctl_ioctl(file, cmd, compat_arg); ++} ++#endif ++ ++static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) ++{ ++ struct vhba_device *vdev = file->private_data; ++ unsigned int mask = 0; ++ unsigned long flags; ++ ++ poll_wait(file, &vdev->cmd_wq, wait); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (next_command(vdev)) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return mask; ++} ++ ++static int vhba_ctl_open (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ ++ pr_debug("ctl dev open\n"); ++ ++ /* check if vhba is probed */ ++ if (!platform_get_drvdata(&vhba_platform_device)) { ++ return -ENODEV; ++ } ++ ++ vdev = vhba_device_alloc(); ++ if (!vdev) { ++ return -ENOMEM; ++ } ++ ++ vdev->kbuf_size = VHBA_KBUF_SIZE; ++ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); ++ if (!vdev->kbuf) { ++ return -ENOMEM; ++ } ++ ++ if (!(retval = vhba_add_device(vdev))) { ++ file->private_data = vdev; ++ } ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_ctl_release (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ pr_debug("ctl dev release\n"); ++ ++ vhba_device_get(vdev); ++ vhba_remove_device(vdev); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); ++ ++ scmd_dbg(vcmd->cmd, "device released with command %lu (%p)\n", vcmd->metatag, vcmd->cmd); ++ vcmd->cmd->result = DID_NO_CONNECT << 16; 
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ vhba_free_command(vcmd); ++ } ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ kfree(vdev->kbuf); ++ vdev->kbuf = NULL; ++ ++ vhba_device_put(vdev); ++ ++ return 0; ++} ++ ++static struct file_operations vhba_ctl_fops = { ++ .owner = THIS_MODULE, ++ .open = vhba_ctl_open, ++ .release = vhba_ctl_release, ++ .read = vhba_ctl_read, ++ .write = vhba_ctl_write, ++ .poll = vhba_ctl_poll, ++ .unlocked_ioctl = vhba_ctl_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = vhba_ctl_compat_ioctl, ++#endif ++}; ++ ++static struct miscdevice vhba_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "vhba_ctl", ++ .fops = &vhba_ctl_fops, ++}; ++ ++static int vhba_probe (struct platform_device *pdev) ++{ ++ struct Scsi_Host *shost; ++ struct vhba_host *vhost; ++ int i; ++ ++ vhba_can_queue = clamp(vhba_can_queue, 1, 256); ++ ++ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); ++ if (!shost) { ++ return -ENOMEM; ++ } ++ ++ shost->max_channel = VHBA_MAX_BUS-1; ++ shost->max_id = VHBA_MAX_ID; ++ /* we don't support lun > 0 */ ++ shost->max_lun = 1; ++ shost->max_cmd_len = MAX_COMMAND_SIZE; ++ shost->can_queue = vhba_can_queue; ++ shost->cmd_per_lun = vhba_can_queue; ++ ++ vhost = (struct vhba_host *)shost->hostdata; ++ memset(vhost, 0, sizeof(struct vhba_host)); ++ ++ vhost->shost = shost; ++ vhost->num_devices = 0; ++ spin_lock_init(&vhost->dev_lock); ++ spin_lock_init(&vhost->cmd_lock); ++ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); ++ vhost->cmd_next = 0; ++ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); ++ if (!vhost->commands) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < vhba_can_queue; i++) { ++ vhost->commands[i].status = VHBA_REQ_FREE; ++ } ++ ++ platform_set_drvdata(pdev, vhost); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ i = scsi_init_shared_tag_map(shost, vhba_can_queue); ++ if (i) return i; ++#endif ++ ++ if (scsi_add_host(shost, &pdev->dev)) { ++ scsi_host_put(shost); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int vhba_remove (struct platform_device *pdev) ++{ ++ struct vhba_host *vhost; ++ struct Scsi_Host *shost; ++ ++ vhost = platform_get_drvdata(pdev); ++ shost = vhost->shost; ++ ++ scsi_remove_host(shost); ++ scsi_host_put(shost); ++ ++ kfree(vhost->commands); ++ ++ return 0; ++} ++ ++static void vhba_release (struct device * dev) ++{ ++ return; ++} ++ ++static struct platform_device vhba_platform_device = { ++ .name = "vhba", ++ .id = -1, ++ .dev = { ++ .release = vhba_release, ++ }, ++}; ++ ++static struct platform_driver vhba_platform_driver = { ++ .driver = { ++ .owner = THIS_MODULE, ++ .name = "vhba", ++ }, ++ .probe = vhba_probe, ++ .remove = vhba_remove, ++}; ++ ++static int __init vhba_init (void) ++{ ++ int ret; ++ ++ ret = platform_device_register(&vhba_platform_device); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ ret = platform_driver_register(&vhba_platform_driver); ++ if (ret < 0) { ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ ret = misc_register(&vhba_miscdev); ++ if (ret < 0) { ++ platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __exit vhba_exit(void) ++{ ++ misc_deregister(&vhba_miscdev); ++ platform_driver_unregister(&vhba_platform_driver); ++ 
platform_device_unregister(&vhba_platform_device); ++} ++ ++module_init(vhba_init); ++module_exit(vhba_exit); ++ diff --git a/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch b/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch new file mode 100644 index 0000000..e4b2fd3 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch @@ -0,0 +1,35 @@ +From 6df7338351c342060088aa9abd561b81ccc113d2 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sun, 15 Sep 2024 19:05:46 +0000 +Subject: vhba: Fix compat with kernel 6.11 + +Upstream commit 0edb555a65d1ef047a9805051c36922b52a38a9d changed the +return value of the `remove` callback from `int` to `void`. +--- + drivers/scsi/vhba/vhba.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/scsi/vhba/vhba.c ++++ b/drivers/scsi/vhba/vhba.c +@@ -1049,7 +1049,11 @@ static int vhba_probe (struct platform_d + return 0; + } + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) + static int vhba_remove (struct platform_device *pdev) ++#else ++static void vhba_remove (struct platform_device *pdev) ++#endif + { + struct vhba_host *vhost; + struct Scsi_Host *shost; +@@ -1062,7 +1066,9 @@ static int vhba_remove (struct platform_ + + kfree(vhost->commands); + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) + return 0; ++#endif + } + + static void vhba_release (struct device * dev) diff --git a/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch new file mode 100644 index 0000000..69c7a89 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch @@ -0,0 +1,626 @@ +From 567908cc05dc5e02b1b9c26620bce3791559f9d4 Mon Sep 17 00:00:00 2001 +From: Daniel Drake +Date: Tue, 4 Jun 2019 14:51:21 +0800 +Subject: ZEN: PCI: Add Intel remapped NVMe device support + +Contains: + - PCI: Add Intel remapped NVMe device support + + Consumer products that are configured by default to run the Intel SATA AHCI + controller in "RAID" or "Intel RST Premium With Intel Optane System + Acceleration" mode are becoming increasingly prevalent. + + Unde this mode, NVMe devices are remapped into the SATA device and become + hidden from the PCI bus, which means that Linux users cannot access their + storage devices unless they go into the firmware setup menu to revert back + to AHCI mode - assuming such option is available. Lack of support for this + mode is also causing complications for vendors who distribute Linux. + + Add support for the remapped NVMe mode by creating a virtual PCI bus, + where the AHCI and NVMe devices are presented separately, allowing the + ahci and nvme drivers to bind in the normal way. + + Unfortunately the NVMe device configuration space is inaccesible under + this scheme, so we provide a fake one, and hope that no DeviceID-based + quirks are needed. The interrupt is shared between the AHCI and NVMe + devices. + + Allow pci_real_dma_dev() to traverse back to the real DMA device from + the PCI devices created on our virtual bus, in case the iommu driver + will be involved with data transfers here. + + The existing ahci driver is modified to not claim devices where remapped + NVMe devices are present, allowing this new driver to step in. + + The details of the remapping scheme came from patches previously + posted by Dan Williams and the resulting discussion. 
+ + https://phabricator.endlessm.com/T24358 + https://phabricator.endlessm.com/T29119 + + Signed-off-by: Daniel Drake + + - PCI: Fix order of remapped NVMe devices +--- + arch/x86/include/asm/pci.h | 6 + + arch/x86/pci/common.c | 7 +- + drivers/ata/ahci.c | 23 +- + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 ++++++++++++++++++++++ + 5 files changed, 488 insertions(+), 16 deletions(-) + create mode 100644 drivers/pci/controller/intel-nvme-remap.c + +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -26,6 +26,7 @@ struct pci_sysdata { + #if IS_ENABLED(CONFIG_VMD) + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ + #endif ++ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ + }; + + extern int pci_routeirq; +@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus + #define is_vmd(bus) false + #endif /* CONFIG_VMD */ + ++static inline bool is_nvme_remap(struct pci_bus *bus) ++{ ++ return to_pci_sysdata(bus)->nvme_remap_dev != NULL; ++} ++ + /* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) + return 0; + } + +-#if IS_ENABLED(CONFIG_VMD) + struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) + { ++#if IS_ENABLED(CONFIG_VMD) + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; ++#endif ++ ++ if (is_nvme_remap(dev->bus)) ++ return to_pci_sysdata(dev->bus)->nvme_remap_dev; + + return dev; + } +-#endif +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_han + } + #endif + +-static void ahci_remap_check(struct pci_dev *pdev, int bar, ++static int ahci_remap_check(struct pci_dev *pdev, int bar, + struct ahci_host_priv *hpriv) + { + int i; +@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_ + pci_resource_len(pdev, bar) < SZ_512K || + bar != AHCI_PCI_BAR_STANDARD || + !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) +- return; ++ return 0; + + cap = readq(hpriv->mmio + AHCI_REMAP_CAP); + for (i = 0; i < AHCI_MAX_REMAP; i++) { +@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_ + } + + if (!hpriv->remapped_nvme) +- return; ++ return 0; + +- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", +- hpriv->remapped_nvme); +- dev_warn(&pdev->dev, +- "Switch your BIOS from RAID to AHCI mode to use them.\n"); +- +- /* +- * Don't rely on the msi-x capability in the remap case, +- * share the legacy interrupt across ahci and remapped devices. 
+- */ +- hpriv->flags |= AHCI_HFLAG_NO_MSI; ++ /* Abort probe, allowing intel-nvme-remap to step in when available */ ++ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); ++ return -ENODEV; + } + + static int ahci_get_irq_vector(struct ata_host *host, int port) +@@ -1896,7 +1889,9 @@ static int ahci_init_one(struct pci_dev + hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; + + /* detect remapped nvme devices */ +- ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ if (rc) ++ return rc; + + sysfs_add_file_to_group(&pdev->dev.kobj, + &dev_attr_remapped_nvme.attr, +--- a/drivers/pci/controller/Makefile ++++ b/drivers/pci/controller/Makefile +@@ -1,4 +1,10 @@ + # SPDX-License-Identifier: GPL-2.0 ++ifdef CONFIG_X86_64 ++ifdef CONFIG_SATA_AHCI ++obj-y += intel-nvme-remap.o ++endif ++endif ++ + obj-$(CONFIG_PCIE_CADENCE) += cadence/ + obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o + obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o +--- /dev/null ++++ b/drivers/pci/controller/intel-nvme-remap.c +@@ -0,0 +1,462 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel remapped NVMe device support. ++ * ++ * Copyright (c) 2019 Endless Mobile, Inc. ++ * Author: Daniel Drake ++ * ++ * Some products ship by default with the SATA controller in "RAID" or ++ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this ++ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe ++ * devices disappear from the PCI bus, and instead their I/O memory becomes ++ * available within the AHCI device BARs. ++ * ++ * This scheme is understood to be a way of avoiding usage of the standard ++ * Windows NVMe driver under that OS, instead mandating usage of Intel's ++ * driver instead, which has better power management, and presumably offers ++ * some RAID/disk-caching solutions too. ++ * ++ * Here in this driver, we support the remapped NVMe mode by claiming the ++ * AHCI device and creating a fake PCIe root port. On the new bus, the ++ * original AHCI device is exposed with only minor tweaks. Then, fake PCI ++ * devices corresponding to the remapped NVMe devices are created. The usual ++ * ahci and nvme drivers are then expected to bind to these devices and ++ * operate as normal. ++ * ++ * The PCI configuration space for the NVMe devices is completely ++ * unavailable, so we fake a minimal one and hope for the best. ++ * ++ * Interrupts are shared between the AHCI and NVMe devices. For simplicity, ++ * we only support the legacy interrupt here, although MSI support ++ * could potentially be added later. ++ */ ++ ++#define MODULE_NAME "intel-nvme-remap" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define AHCI_PCI_BAR_STANDARD 5 ++ ++struct nvme_remap_dev { ++ struct pci_dev *dev; /* AHCI device */ ++ struct pci_bus *bus; /* our fake PCI bus */ ++ struct pci_sysdata sysdata; ++ int irq_base; /* our fake interrupts */ ++ ++ /* ++ * When we detect an all-ones write to a BAR register, this flag ++ * is set, so that we return the BAR size on the next read (a ++ * standard PCI behaviour). ++ * This includes the assumption that an all-ones BAR write is ++ * immediately followed by a read of the same register. ++ */ ++ bool bar_sizing; ++ ++ /* ++ * Resources copied from the AHCI device, to be regarded as ++ * resources on our fake bus. ++ */ ++ struct resource ahci_resources[PCI_NUM_RESOURCES]; ++ ++ /* Resources corresponding to the NVMe devices. 
*/ ++ struct resource remapped_dev_mem[AHCI_MAX_REMAP]; ++ ++ /* Number of remapped NVMe devices found. */ ++ int num_remapped_devices; ++}; ++ ++static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) ++{ ++ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); ++} ++ ++ ++/******** PCI configuration space **********/ ++ ++/* ++ * Helper macros for tweaking returned contents of PCI configuration space. ++ * ++ * value contains len bytes of data read from reg. ++ * If fixup_reg is included in that range, fix up the contents of that ++ * register to fixed_value. ++ */ ++#define NR_FIX8(fixup_reg, fixed_value) do { \ ++ if (reg <= fixup_reg && fixup_reg < reg + len) \ ++ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ ++ } while (0) ++ ++#define NR_FIX16(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ } while (0) ++ ++#define NR_FIX24(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++#define NR_FIX32(fixup_reg, fixed_value) do { \ ++ NR_FIX16(fixup_reg, (u16) fixed_value); \ ++ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++/* ++ * Read PCI config space of the slot 0 (AHCI) device. ++ * We pass through the read request to the underlying device, but ++ * tweak the results in some cases. ++ */ ++static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, ++ int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ int ret; ++ ++ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, ++ reg, len, value); ++ if (ret) ++ return ret; ++ ++ /* ++ * Adjust the device class, to prevent this driver from attempting to ++ * additionally probe the device we're simulating here. ++ */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); ++ ++ /* ++ * Unset interrupt pin, otherwise ACPI tries to find routing ++ * info for our virtual IRQ, fails, and complains. ++ */ ++ NR_FIX8(PCI_INTERRUPT_PIN, 0); ++ ++ /* ++ * Truncate the AHCI BAR to not include the region that covers the ++ * hidden devices. This will cause the ahci driver to successfully ++ * probe th new device (instead of handing it over to this driver). ++ */ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); ++ nrdev->bar_sizing = false; ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* ++ * Read PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we provide a minimal, ++ * fake config space instead. 
++ */ ++static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, ++ int reg, int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct resource *remapped_mem; ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ *value = 0; ++ remapped_mem = &nrdev->remapped_dev_mem[port - 1]; ++ ++ /* Set a Vendor ID, otherwise Linux assumes no device is present */ ++ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); ++ ++ /* Always appear on & bus mastering */ ++ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); ++ ++ /* Set class so that nvme driver probes us */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); ++ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_0, ++ ~(resource_size(remapped_mem) - 1)); ++ nrdev->bar_sizing = false; ++ } else { ++ resource_size_t mem_start = remapped_mem->start; ++ ++ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; ++ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); ++ mem_start >>= 32; ++ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* Read PCI configuration space. */ ++static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 *value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_read_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++/* ++ * Write PCI config space of the slot 0 (AHCI) device. ++ * Apart from the special case of BAR sizing, we disable all writes. ++ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) ++ * that would affect the operation of the NVMe devices. ++ */ ++static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, ++ int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ ++ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { ++ /* ++ * Writing all-ones to a BAR means that the size of the ++ * memory region is being checked. Flag this so that we can ++ * reply with an appropriate size on the next read. ++ */ ++ if (value == ~0) ++ nrdev->bar_sizing = true; ++ ++ return ahci_dev_bus->ops->write(ahci_dev_bus, ++ nrdev->dev->devfn, ++ reg, len, value); ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* ++ * Write PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we reject all ++ * writes, except for the special case of BAR probing. ++ */ ++static int nvme_remap_pci_write_remapped(struct pci_bus *bus, ++ unsigned int port, ++ int reg, int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ /* ++ * Writing all-ones to a BAR means that the size of the memory ++ * region is being checked. Flag this so that we can reply with ++ * an appropriate size on the next read. ++ */ ++ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 ++ && reg <= PCI_BASE_ADDRESS_5) { ++ nrdev->bar_sizing = true; ++ return PCIBIOS_SUCCESSFUL; ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* Write PCI configuration space. 
*/ ++static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_write_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++static struct pci_ops nvme_remap_pci_ops = { ++ .read = nvme_remap_pci_read, ++ .write = nvme_remap_pci_write, ++}; ++ ++ ++/******** Initialization & exit **********/ ++ ++/* ++ * Find a PCI domain ID to use for our fake bus. ++ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). ++ */ ++static int find_free_domain(void) ++{ ++ int domain = 0xffff; ++ struct pci_bus *bus = NULL; ++ ++ while ((bus = pci_find_next_bus(bus)) != NULL) ++ domain = max_t(int, domain, pci_domain_nr(bus)); ++ ++ return domain + 1; ++} ++ ++static int find_remapped_devices(struct nvme_remap_dev *nrdev, ++ struct list_head *resources) ++{ ++ void __iomem *mmio; ++ int i, count = 0; ++ u32 cap; ++ ++ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, ++ pci_resource_len(nrdev->dev, ++ AHCI_PCI_BAR_STANDARD)); ++ if (!mmio) ++ return -ENODEV; ++ ++ /* Check if this device might have remapped nvme devices. */ ++ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || ++ !(readl(mmio + AHCI_VSCAP) & 1)) ++ return -ENODEV; ++ ++ cap = readq(mmio + AHCI_REMAP_CAP); ++ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { ++ struct resource *remapped_mem; ++ ++ if ((cap & (1 << i)) == 0) ++ continue; ++ if (readl(mmio + ahci_remap_dcc(i)) ++ != PCI_CLASS_STORAGE_EXPRESS) ++ continue; ++ ++ /* We've found a remapped device */ ++ remapped_mem = &nrdev->remapped_dev_mem[count++]; ++ remapped_mem->start = ++ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) ++ + ahci_remap_base(i); ++ remapped_mem->end = remapped_mem->start ++ + AHCI_REMAP_N_SIZE - 1; ++ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; ++ pci_add_resource(resources, remapped_mem); ++ } ++ ++ pcim_iounmap(nrdev->dev, mmio); ++ ++ if (count == 0) ++ return -ENODEV; ++ ++ nrdev->num_remapped_devices = count; ++ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", ++ nrdev->num_remapped_devices); ++ return 0; ++} ++ ++static void nvme_remap_remove_root_bus(void *data) ++{ ++ struct pci_bus *bus = data; ++ ++ pci_stop_root_bus(bus); ++ pci_remove_root_bus(bus); ++} ++ ++static int nvme_remap_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ struct nvme_remap_dev *nrdev; ++ LIST_HEAD(resources); ++ int i; ++ int ret; ++ struct pci_dev *child; ++ ++ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); ++ nrdev->sysdata.domain = find_free_domain(); ++ nrdev->sysdata.nvme_remap_dev = dev; ++ nrdev->dev = dev; ++ pci_set_drvdata(dev, nrdev); ++ ++ ret = pcim_enable_device(dev); ++ if (ret < 0) ++ return ret; ++ ++ pci_set_master(dev); ++ ++ ret = find_remapped_devices(nrdev, &resources); ++ if (ret) ++ return ret; ++ ++ /* Add resources from the original AHCI device */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &dev->resource[i]; ++ ++ if (res->start) { ++ struct resource *nr_res = &nrdev->ahci_resources[i]; ++ ++ nr_res->start = res->start; ++ nr_res->end = res->end; ++ nr_res->flags = res->flags; ++ pci_add_resource(&resources, nr_res); ++ } ++ } ++ ++ /* Create virtual interrupts */ ++ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, ++ nrdev->num_remapped_devices + 1, ++ 0); ++ if (nrdev->irq_base < 0) ++ return nrdev->irq_base; ++ ++ /* Create and populate PCI 
bus */ ++ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, ++ &nrdev->sysdata, &resources); ++ if (!nrdev->bus) ++ return -ENODEV; ++ ++ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, ++ nrdev->bus)) ++ return -ENOMEM; ++ ++ /* We don't support sharing MSI interrupts between these devices */ ++ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; ++ ++ pci_scan_child_bus(nrdev->bus); ++ ++ list_for_each_entry(child, &nrdev->bus->devices, bus_list) { ++ /* ++ * Prevent PCI core from trying to move memory BARs around. ++ * The hidden NVMe devices are at fixed locations. ++ */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &child->resource[i]; ++ ++ if (res->flags & IORESOURCE_MEM) ++ res->flags |= IORESOURCE_PCI_FIXED; ++ } ++ ++ /* Share the legacy IRQ between all devices */ ++ child->irq = dev->irq; ++ } ++ ++ pci_assign_unassigned_bus_resources(nrdev->bus); ++ pci_bus_add_devices(nrdev->bus); ++ ++ return 0; ++} ++ ++static const struct pci_device_id nvme_remap_ids[] = { ++ /* ++ * Match all Intel RAID controllers. ++ * ++ * There's overlap here with the set of devices detected by the ahci ++ * driver, but ahci will only successfully probe when there ++ * *aren't* any remapped NVMe devices, and this driver will only ++ * successfully probe when there *are* remapped NVMe devices that ++ * need handling. ++ */ ++ { ++ PCI_VDEVICE(INTEL, PCI_ANY_ID), ++ .class = PCI_CLASS_STORAGE_RAID << 8, ++ .class_mask = 0xffffff00, ++ }, ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, nvme_remap_ids); ++ ++static struct pci_driver nvme_remap_drv = { ++ .name = MODULE_NAME, ++ .id_table = nvme_remap_ids, ++ .probe = nvme_remap_probe, ++}; ++module_pci_driver(nvme_remap_drv); ++ ++MODULE_AUTHOR("Daniel Drake "); ++MODULE_LICENSE("GPL v2"); diff --git a/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch new file mode 100644 index 0000000..e2a70b0 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch @@ -0,0 +1,29 @@ +From 89a4975b413afa5f591c7a18109d35b5e848b582 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sun, 8 Mar 2020 00:31:35 -0800 +Subject: ZEN: Disable stack conservation for GCC + +There's plenty of room on the stack for a few more inlined bytes here +and there. The measured stack usage at runtime is still safe without +this, and performance is surely improved at a microscopic level, so +remove it. 
+ +Signed-off-by: Sultan Alsawaf +--- + Makefile | 5 ----- + 1 file changed, 5 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -1003,11 +1003,6 @@ KBUILD_CFLAGS += -fno-strict-overflow + # Make sure -fstack-check isn't enabled (like gentoo apparently did) + KBUILD_CFLAGS += -fno-stack-check + +-# conserve stack if available +-ifdef CONFIG_CC_IS_GCC +-KBUILD_CFLAGS += -fconserve-stack +-endif +- + # change __FILE__ to the relative path from the srctree + KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) + diff --git a/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch b/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch new file mode 100644 index 0000000..4ef331c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch @@ -0,0 +1,43 @@ +From 5f2e6f795ce9908851acb20ab03af6550ae54f3b Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: ZEN: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -61,14 +61,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb depends on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -80,6 +74,13 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb depends on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch b/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch new file mode 100644 index 0000000..b229161 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch @@ -0,0 +1,105 @@ +From 3d92c251c04b1b4c6363018220af42ec3a294d1e Mon Sep 17 00:00:00 2001 +From: Kenny Levinsen +Date: Sun, 27 Dec 2020 14:43:13 +0000 +Subject: ZEN: Input: evdev - use call_rcu when detaching client + +Significant time was spent on synchronize_rcu in evdev_detach_client +when applications closed evdev devices. Switching VT away from a +graphical environment commonly leads to mass input device closures, +which could lead to noticable delays on systems with many input devices. + +Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev +client struct till after the RCU grace period instead of blocking the +calling application. 
+ +While this does not solve all slow evdev fd closures, it takes care of a +good portion of them, including this simple test: + + #include + #include + + int main(int argc, char *argv[]) + { + int idx, fd; + const char *path = "/dev/input/event0"; + for (idx = 0; idx < 1000; idx++) { + if ((fd = open(path, O_RDWR)) == -1) { + return -1; + } + close(fd); + } + return 0; + } + +Time to completion of above test when run locally: + + Before: 0m27.111s + After: 0m0.018s + +Signed-off-by: Kenny Levinsen +--- + drivers/input/evdev.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +--- a/drivers/input/evdev.c ++++ b/drivers/input/evdev.c +@@ -46,6 +46,7 @@ struct evdev_client { + struct fasync_struct *fasync; + struct evdev *evdev; + struct list_head node; ++ struct rcu_head rcu; + enum input_clock_type clk_type; + bool revoked; + unsigned long *evmasks[EV_CNT]; +@@ -368,13 +369,22 @@ static void evdev_attach_client(struct e + spin_unlock(&evdev->client_lock); + } + ++static void evdev_reclaim_client(struct rcu_head *rp) ++{ ++ struct evdev_client *client = container_of(rp, struct evdev_client, rcu); ++ unsigned int i; ++ for (i = 0; i < EV_CNT; ++i) ++ bitmap_free(client->evmasks[i]); ++ kvfree(client); ++} ++ + static void evdev_detach_client(struct evdev *evdev, + struct evdev_client *client) + { + spin_lock(&evdev->client_lock); + list_del_rcu(&client->node); + spin_unlock(&evdev->client_lock); +- synchronize_rcu(); ++ call_rcu(&client->rcu, evdev_reclaim_client); + } + + static int evdev_open_device(struct evdev *evdev) +@@ -427,7 +437,6 @@ static int evdev_release(struct inode *i + { + struct evdev_client *client = file->private_data; + struct evdev *evdev = client->evdev; +- unsigned int i; + + mutex_lock(&evdev->mutex); + +@@ -439,11 +448,6 @@ static int evdev_release(struct inode *i + + evdev_detach_client(evdev, client); + +- for (i = 0; i < EV_CNT; ++i) +- bitmap_free(client->evmasks[i]); +- +- kvfree(client); +- + evdev_close_device(evdev); + + return 0; +@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inod + + err_free_client: + evdev_detach_client(evdev, client); +- kvfree(client); + return error; + } + diff --git a/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch b/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch new file mode 100644 index 0000000..c94ccd2 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch @@ -0,0 +1,31 @@ +From 67c446794b5fc16009bc1f31aee8846576796b11 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Mon, 11 Jul 2022 19:10:30 -0500 +Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State + drivers + +Although both P-State drivers depend on schedutil in Kconfig, both code +bases do not use any schedutil code. This arbitrarily enables schedutil +when unwanted in some configurations. +--- + drivers/cpufreq/Kconfig.x86 | 2 -- + 1 file changed, 2 deletions(-) + +--- a/drivers/cpufreq/Kconfig.x86 ++++ b/drivers/cpufreq/Kconfig.x86 +@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + select CPU_FREQ_GOV_PERFORMANCE +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver provides a P state for Intel core processors. 
+ The driver implements an internal governor and will become +@@ -39,7 +38,6 @@ config X86_AMD_PSTATE + depends on X86 && ACPI + select ACPI_PROCESSOR + select ACPI_CPPC_LIB if X86_64 +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver adds a CPUFreq driver which utilizes a fine grain + processor performance frequency control range instead of legacy diff --git a/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch b/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch new file mode 100644 index 0000000..007f9c6 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch @@ -0,0 +1,53 @@ +From 1d5cc90283f9de0c4cc996a2f3e6ba0306c1f14d Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
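
Editor's note (not part of the patch): a usage sketch of the new parameter. A distribution kernel that bakes "intel_pstate=disable" into its built-in command line can be switched back per machine from the bootloader, assuming the bootloader-supplied string is parsed after the built-in one so the later option wins. The Debian-style commands below are illustrative only.

    # /etc/default/grub -- append the override to the user command line
    GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_pstate=enable"

    # regenerate the bootloader configuration, then reboot
    update-grub

    # after reboot, confirm which scaling driver actually bound
    cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
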
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2237,6 +2237,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + active + Use intel_pstate driver to bypass the scaling + governors layer of cpufreq and provides it own +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -3524,6 +3524,8 @@ static int __init intel_pstate_setup(cha + + if (!strcmp(str, "disable")) + no_load = 1; ++ else if (!strcmp(str, "enable")) ++ no_load = 0; + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) diff --git a/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch b/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch new file mode 100644 index 0000000..d67e7f0 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch @@ -0,0 +1,91 @@ +From 493aee188c1c7c5cee2791820bfc779932bc10dc Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Fri, 15 Mar 2024 12:36:51 -0500 +Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with + ignore_min_pcap + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 +++ + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 ++++++++++++-- + 4 files changed, 26 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -162,6 +162,7 @@ struct amdgpu_watchdog_timer { + */ + extern int amdgpu_modeset; + extern unsigned int amdgpu_vram_limit; ++extern int amdgpu_ignore_min_pcap; + extern int amdgpu_vis_vram_limit; + extern int amdgpu_gart_size; + extern int amdgpu_gtt_size; +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -135,6 +135,7 @@ enum AMDGPU_DEBUG_MASK { + }; + + unsigned int amdgpu_vram_limit = UINT_MAX; ++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */ + int amdgpu_vis_vram_limit; + int amdgpu_gart_size = -1; /* auto */ + int amdgpu_gtt_size = -1; /* auto */ +@@ -249,6 +250,15 @@ struct amdgpu_watchdog_timer amdgpu_watc + }; + + /** ++ * DOC: ignore_min_pcap (int) ++ * Ignore the minimum power cap. ++ * Useful on graphics cards where the minimum power cap is very high. ++ * The default is 0 (Do not ignore). ++ */ ++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap"); ++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600); ++ ++/** + * DOC: vramlimit (int) + * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). 
+ */ +--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c ++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +@@ -3272,6 +3272,9 @@ static ssize_t amdgpu_hwmon_show_power_c + struct device_attribute *attr, + char *buf) + { ++ if (amdgpu_ignore_min_pcap) ++ return sysfs_emit(buf, "%i\n", 0); ++ + return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN); + } + +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -2762,7 +2762,10 @@ int smu_get_power_limit(void *handle, + *limit = smu->max_power_limit; + break; + case SMU_PPT_LIMIT_MIN: +- *limit = smu->min_power_limit; ++ if (amdgpu_ignore_min_pcap) ++ *limit = 0; ++ else ++ *limit = smu->min_power_limit; + break; + default: + return -EINVAL; +@@ -2786,7 +2789,14 @@ static int smu_set_power_limit(void *han + if (smu->ppt_funcs->set_power_limit) + return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); + +- if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { ++ if (amdgpu_ignore_min_pcap) { ++ if ((limit > smu->max_power_limit)) { ++ dev_err(smu->adev->dev, ++ "New power limit (%d) is over the max allowed %d\n", ++ limit, smu->max_power_limit); ++ return -EINVAL; ++ } ++ } else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { + dev_err(smu->adev->dev, + "New power limit (%d) is out of range [%d,%d]\n", + limit, smu->min_power_limit, smu->max_power_limit); diff --git a/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch b/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch new file mode 100644 index 0000000..b6d30fe --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch @@ -0,0 +1,29 @@ +From 531d884259632814e998d1662690daa1e57dd98c Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Thu, 27 Apr 2023 14:43:57 -0500 +Subject: ZEN: Set default max map count to (INT_MAX - 5) + +Per [Fedora][1], they intend to change the default max map count for +their distribution to improve OOTB compatibility with games played +through Steam/Proton. The value they picked comes from the Steam Deck, +which defaults to INT_MAX - MAPCOUNT_ELF_CORE_MARGIN. + +Since most ZEN and Liquorix users probably play games, follow Valve's +lead and raise this value to their default. + +[1]: https://fedoraproject.org/wiki/Changes/IncreaseVmMaxMapCount +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -197,7 +197,7 @@ static inline void __mm_zero_struct_page + * that. 
+ */ + #define MAPCOUNT_ELF_CORE_MARGIN (5) +-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) ++#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + + extern int sysctl_max_map_count; + diff --git a/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch b/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch new file mode 100644 index 0000000..58ddd57 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch @@ -0,0 +1,24 @@ +From 032775267df11a87616d2ec7f09c0b1b12da5da7 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:10:06 +0100 +Subject: ZEN: INTERACTIVE: Base config item + +--- + init/Kconfig | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -134,6 +134,12 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZEN_INTERACTIVE ++ bool "Tune kernel for interactivity" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch new file mode 100644 index 0000000..c74d54f --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch @@ -0,0 +1,37 @@ +From c614dbbfd3480cf18c90fd51bb52abd53339b790 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:11:05 +0100 +Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices + +--- + block/elevator.c | 4 ++++ + init/Kconfig | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -569,7 +569,11 @@ static struct elevator_type *elevator_ge + !blk_mq_is_shared_tags(q->tag_set->flags)) + return NULL; + ++#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) ++ return elevator_find_get(q, "bfq"); ++#else + return elevator_find_get(q, "mq-deadline"); ++#endif + } + + /* +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -140,6 +140,10 @@ config ZEN_INTERACTIVE + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+ ++ --- Block Layer ---------------------------------------- ++ ++ Default scheduler for SQ..: mq-deadline -> bfq ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch new file mode 100644 index 0000000..4be7534 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch @@ -0,0 +1,36 @@ +From b87281991e8f34e557cf2fb1614b3f4808100233 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 12 Dec 2022 00:03:03 +0100 +Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices + +--- + block/elevator.c | 6 ++++++ + init/Kconfig | 1 + + 2 files changed, 7 insertions(+) + +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -567,7 +567,13 @@ static struct elevator_type *elevator_ge + + if (q->nr_hw_queues != 1 && + !blk_mq_is_shared_tags(q->tag_set->flags)) ++#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) ++ return elevator_find_get(q, "kyber"); ++#elif defined(CONFIG_ZEN_INTERACTIVE) ++ return elevator_find_get(q, "mq-deadline"); ++#else + return NULL; ++#endif + + #if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + return elevator_find_get(q, "bfq"); +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -143,6 +143,7 @@ config ZEN_INTERACTIVE + --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq ++ Default scheduler for MQ..: none -> kyber + + config BROKEN + bool diff --git a/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch new file mode 100644 index 0000000..7ddc6fc --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch @@ -0,0 +1,59 @@ +From e2db8ce3c52c7bd37e93728d6c12a483f17634bc Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:21:09 +0100 +Subject: ZEN: INTERACTIVE: Enable background reclaim of hugepages + +Use [defer+madvise] as default khugepaged defrag strategy: + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". 
+ +Reasoning and details in the original patch: https://lwn.net/Articles/711248/ +--- + init/Kconfig | 4 ++++ + mm/huge_memory.c | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -145,6 +145,10 @@ config ZEN_INTERACTIVE + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber + ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Background-reclaim hugepages...: no -> yes ++ + config BROKEN + bool + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Wed, 11 Aug 2021 18:47:46 -0500 +Subject: ZEN: INTERACTIVE: Tune mgLRU to protect cache used in the last second + +Although not identical to the le9 patches that protect a byte-amount of +cache through tunables, multigenerational LRU now supports protecting +cache accessed in the last X milliseconds. + +In #218, Yu recommends starting with 1000ms and tuning as needed. This +looks like a safe default and turning on this feature should help users +that don't know they need it. +--- + init/Kconfig | 1 + + mm/vmscan.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -148,6 +148,7 @@ config ZEN_INTERACTIVE + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes ++ MG-LRU minimum cache TTL.......: 0 -> 1000 ms + + config BROKEN + bool +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3968,7 +3968,11 @@ static bool lruvec_is_reclaimable(struct + } + + /* to protect the working set of the last N jiffies */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned long lru_gen_min_ttl __read_mostly = HZ; ++#else + static unsigned long lru_gen_min_ttl __read_mostly; ++#endif + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { diff --git a/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch new file mode 100644 index 0000000..75a0131 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch @@ -0,0 +1,104 @@ +From 44a6d7ca11b601b34724dc41e086576499a096bd Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 31 Oct 2023 19:03:10 +0100 +Subject: ZEN: INTERACTIVE: Tune EEVDF for interactivity + +5.7: +Take "sysctl_sched_nr_migrate" tune from early XanMod builds of 128. As +of 5.7, XanMod uses 256 but that may affect applications that require +timely response to IRQs. + +5.15: +Per [a comment][1] on our ZEN INTERACTIVE commit, reducing the cost of +migration causes the system less responsive under high load. Most +likely the combination of reduced migration cost + the higher number of +tasks that can be migrated at once contributes to this. + +To better handle this situation, restore the mainline migration cost +value and also reduce the max number of tasks that can be migrated in +batch from 128 to 64. + +If this doesn't help, we'll restore the reduced migration cost and keep +total number of tasks that can be migrated at once to 32. + +[1]: https://github.com/zen-kernel/zen-kernel/commit/be5ba234ca0a5aabe74bfc7e1f636f085bd3823c#commitcomment-63159674 + +6.6: +Port the tuning to EEVDF, which removed a couple of settings. 
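
Editor's note (not part of the upstream changelog): if CONFIG_SCHED_DEBUG is enabled, the values this patch tunes can be read back from a running kernel, which helps when comparing stock and ZEN defaults. The paths are a sketch and vary between kernel versions:

    # EEVDF base slice and migration cost (nanoseconds)
    cat /sys/kernel/debug/sched/base_slice_ns
    cat /sys/kernel/debug/sched/migration_cost_ns

    # tasks moved per load-balance iteration
    cat /sys/kernel/debug/sched/nr_migrate

    # CFS bandwidth slice (microseconds)
    sysctl kernel.sched_cfs_bandwidth_slice_us
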
+ +6.7: +Instead of increasing the number of tasks that migrate at once, migrate +the amount acceptable for PREEMPT_RT, but reduce the cost so migrations +occur more often. + +This should make CFS/EEVDF behave more like out-of-tree schedulers that +aggressively use idle cores to reduce latency, but without the jank +caused by rebalancing too many tasks at once. +--- + init/Kconfig | 7 +++++++ + kernel/sched/fair.c | 13 +++++++++++++ + kernel/sched/sched.h | 2 +- + 3 files changed, 21 insertions(+), 1 deletion(-) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -150,6 +150,13 @@ config ZEN_INTERACTIVE + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + ++ --- EEVDF CPU Scheduler -------------------------------- ++ ++ Minimal granularity............: 0.75 -> 0.4 ms ++ Migration cost.................: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Task rebalancing threshold.....: 32 -> 8 ++ + config BROKEN + bool + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scalin + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++unsigned int sysctl_sched_base_slice = 400000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; ++#else + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; ++#endif + ++#ifdef CONFIG_ZEN_INTERACTIVE ++const_debug unsigned int sysctl_sched_migration_cost = 250000UL; ++#else + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cp + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2591,7 +2591,7 @@ extern void deactivate_task(struct rq *r + + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +-#ifdef CONFIG_PREEMPT_RT ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) + # define SCHED_NR_MIGRATE_BREAK 8 + #else + # define SCHED_NR_MIGRATE_BREAK 32 diff --git a/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch new file mode 100644 index 0000000..5cef79c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch @@ -0,0 +1,90 @@ +From e9c6f500f4429c32f583d6da11352b2f0bcce4c8 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:27:16 +0100 +Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity + +4.10: +During some personal testing with the Dolphin emulator, MuQSS has +serious problems scaling its frequencies causing poor performance where +boosting the CPU frequencies would have fixed them. Reducing the +up_threshold to 45 with MuQSS appears to fix the issue, letting the +introduction to "Star Wars: Rogue Leader" run at 100% speed versus about +80% on my test system. 
+ +Also, lets refactor the definitions and include some indentation to help +the reader discern what the scope of all the macros are. + +5.4: +On the last custom kernel benchmark from Phoronix with Xanmod, Michael +configured all the kernels to run using ondemand instead of the kernel's +[default selection][1]. This reminded me that another option outside of +the kernels control is the user's choice to change the cpufreq governor, +for better or for worse. + +In Liquorix, performance is the default governor whether you're running +acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or +LMT to control the power balance on their system, especially when +they're plugged in or on battery. However, it's pretty clear to me a +lot of people would choose ondemand over performance since it's not +obvious it has huge performance ramifications with MuQSS, and ondemand +otherwise is "good enough" for most people. + +Lets codify lower up thresholds for MuQSS to more closely synergize with +its aggressive thread migration behavior. This way when ondemand is +configured, you get sort of a "performance-lite" type of result but with +the power savings you expect when leaving the running system idle. + +[1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel + +5.14: +Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot +more of mainline scheduling and do a good job of pinning single threaded +tasks to their respective core, there's still applications that +confusingly run steady near 50% and benefit from going full speed or +turbo when they need to run (emulators for more recent consoles come to +mind). + +Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. + +5.15: +Remove MuQSS cpufreq configuration. 
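
Editor's note (not part of the patch): the macros patched here surface as ondemand's global sysfs tunables, so the effect can be approximated on an unpatched kernel at runtime. Illustrative only, and the path assumes global rather than per-policy governor tunables:

    # with the ondemand governor selected
    cd /sys/devices/system/cpu/cpufreq/ondemand
    cat up_threshold sampling_down_factor

    # mimic the ZEN defaults from this patch
    echo 55 > up_threshold
    echo 5 > sampling_down_factor
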
+--- + drivers/cpufreq/cpufreq_ondemand.c | 8 +++++++- + init/Kconfig | 6 ++++++ + 2 files changed, 13 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,10 +18,16 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ ++#if defined(CONFIG_ZEN_INTERACTIVE) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define MICRO_FREQUENCY_UP_THRESHOLD (60) ++#define DEF_SAMPLING_DOWN_FACTOR (5) ++#else + #define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define DEF_SAMPLING_DOWN_FACTOR (100) ++#endif + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -157,6 +157,12 @@ config ZEN_INTERACTIVE + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + ++ --- CPUFreq Settings ----------------------------------- ++ ++ Ondemand sampling down factor..: 100 -> 5 ++ Ondemand default up threshold..: 63 -> 55 ++ Ondemand micro up threshold....: 70 -> 60 ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch new file mode 100644 index 0000000..e6a0e18 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch @@ -0,0 +1,33 @@ +From 4706a3fb5823c97dc6acc1e86958b71e2c048ec5 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sat, 5 Mar 2022 11:37:14 -0600 +Subject: ZEN: INTERACTIVE: mm: Disable unevictable compaction + +This option is already disabled when CONFIG_PREEMPT_RT is enabled, lets +turn it off when CONFIG_ZEN_INTERACTIVE is set as well. +--- + init/Kconfig | 1 + + mm/Kconfig | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -149,6 +149,7 @@ config ZEN_INTERACTIVE + + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms ++ Compact unevictable............: yes -> no + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -649,7 +649,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || ZEN_INTERACTIVE + default 1 + + # diff --git a/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch new file mode 100644 index 0000000..2927530 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch @@ -0,0 +1,38 @@ +From 8146f220f871c4db77c8363c831784041a5bcf7b Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sat, 24 Oct 2020 22:17:49 -0700 +Subject: ZEN: INTERACTIVE: mm: Disable proactive compaction by default + +On-demand compaction works fine assuming that you don't have a need to +spam the page allocator nonstop for large order page allocations. 
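
Editor's note (not part of the patch): the default being zeroed here is also exposed as a sysctl, so the same behaviour can be tried without rebuilding. A minimal sketch:

    # mainline default is 20; 0 disables proactive compaction entirely
    sysctl vm.compaction_proactiveness
    sysctl -w vm.compaction_proactiveness=0
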
+ +Signed-off-by: Sultan Alsawaf +--- + init/Kconfig | 1 + + mm/compaction.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -150,6 +150,7 @@ config ZEN_INTERACTIVE + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no ++ Compaction proactiveness.......: 20 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1950,7 +1950,11 @@ static int sysctl_compact_unevictable_al + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + static int sysctl_extfrag_threshold = 500; + static int __read_mostly sysctl_compact_memory; + diff --git a/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch new file mode 100644 index 0000000..e745c99 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch @@ -0,0 +1,57 @@ +From 5f16843397798d2c709e3b8af4b1a73539d13aa8 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sat, 28 Mar 2020 13:06:28 -0700 +Subject: ZEN: INTERACTIVE: mm: Disable watermark boosting by default + +What watermark boosting does is preemptively fire up kswapd to free +memory when there hasn't been an allocation failure. It does this by +increasing kswapd's high watermark goal and then firing up kswapd. The +reason why this causes freezes is because, with the increased high +watermark goal, kswapd will steal memory from processes that need it in +order to make forward progress. These processes will, in turn, try to +allocate memory again, which will cause kswapd to steal necessary pages +from those processes again, in a positive feedback loop known as page +thrashing. When page thrashing occurs, your system is essentially +livelocked until the necessary forward progress can be made to stop +processes from trying to continuously allocate memory and trigger +kswapd to steal it back. + +This problem already occurs with kswapd *without* watermark boosting, +but it's usually only encountered on machines with a small amount of +memory and/or a slow CPU. Watermark boosting just makes the existing +problem worse enough to notice on higher spec'd machines. + +Disable watermark boosting by default since it's a total dumpster fire. +I can't imagine why anyone would want to explicitly enable it, but the +option is there in case someone does. 
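+
+The factor is also exposed as a sysctl, where 15000 corresponds to the
+"1.5" shown in the Kconfig summary (the value is expressed in units of
+1/10000 of the high watermark). A minimal sketch of checking or
+disabling it on a stock kernel:
+
+    sysctl vm.watermark_boost_factor        # 15000 upstream, 0 here
+    sysctl -w vm.watermark_boost_factor=0   # same effect at run time (root)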
+ +Signed-off-by: Sultan Alsawaf +--- + init/Kconfig | 1 + + mm/page_alloc.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -151,6 +151,7 @@ config ZEN_INTERACTIVE + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 ++ Watermark boost factor.........: 1.5 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -271,7 +271,11 @@ const char * const migratetype_names[MIG + + int min_free_kbytes = 1024; + int user_min_free_kbytes = -1; ++#ifdef CONFIG_ZEN_INTERACTIVE ++static int watermark_boost_factor __read_mostly; ++#else + static int watermark_boost_factor __read_mostly = 15000; ++#endif + static int watermark_scale_factor = 10; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ diff --git a/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch new file mode 100644 index 0000000..0333d12 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch @@ -0,0 +1,57 @@ +From eb51c53e5ded1743830368815c550b871f950738 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Wed, 20 Oct 2021 20:50:11 -0700 +Subject: ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to + reduce scheduling delays + +The page allocator processes free pages in groups of pageblocks, where +the size of a pageblock is typically quite large (1024 pages without +hugetlbpage support). Pageblocks are processed atomically with the zone +lock held, which can cause severe scheduling delays on both the CPU +going through the pageblock and any other CPUs waiting to acquire the +zone lock. A frequent offender is move_freepages_block(), which is used +by rmqueue() for page allocation. + +As it turns out, there's no requirement for pageblocks to be so large, +so the pageblock order can simply be reduced to ease the scheduling +delays and zone lock contention. PAGE_ALLOC_COSTLY_ORDER is used as a +reasonable setting to ensure non-costly page allocation requests can +still be serviced without always needing to free up more than one +pageblock's worth of pages at a time. + +This has a noticeable effect on overall system latency when memory +pressure is elevated. The various mm functions which operate on +pageblocks no longer appear in the preemptoff tracer, where previously +they would spend up to 100 ms on a mobile arm64 CPU processing a +pageblock with preemption disabled and the zone lock held. 
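+
+Which order a given build actually ended up with can be read straight
+from /proc/pagetypeinfo (root-only on recent kernels). Note that on
+configurations with hugepage support the order comes from the hugepage
+path instead, so 3 only shows up for the non-hugetlbpage case patched
+here:
+
+    # The first two lines report "Page block order" and "Pages per block"
+    head -n 2 /proc/pagetypeinfo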
+ +Signed-off-by: Sultan Alsawaf +--- + include/linux/pageblock-flags.h | 4 ++++ + init/Kconfig | 1 + + 2 files changed, 5 insertions(+) + +--- a/include/linux/pageblock-flags.h ++++ b/include/linux/pageblock-flags.h +@@ -52,7 +52,11 @@ extern unsigned int pageblock_order; + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++#define pageblock_order PAGE_ALLOC_COSTLY_ORDER ++#else + #define pageblock_order MAX_PAGE_ORDER ++#endif + + #endif /* CONFIG_HUGETLB_PAGE */ + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -152,6 +152,7 @@ config ZEN_INTERACTIVE + Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 ++ Pageblock order................: 10 -> 3 + + --- EEVDF CPU Scheduler -------------------------------- + diff --git a/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch new file mode 100644 index 0000000..2ead0f7 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch @@ -0,0 +1,44 @@ +From a8a0d4b9f356610babe5b884500799310fe6dcdd Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sat, 21 May 2022 15:15:09 -0500 +Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops + +Queueing in dm-crypt for crypto operations reduces performance on modern +systems. As discussed in an article from Cloudflare, they discovered +that queuing was introduced because the crypto subsystem used to be +synchronous. Since it's now asynchronous, we get double queueing when +using the subsystem through dm-crypt. This is obviously undesirable and +reduces throughput and increases latency. + +Disable queueing when using our Zen Interactive configuration. + +Fixes: https://github.com/zen-kernel/zen-kernel/issues/282 +--- + drivers/md/dm-crypt.c | 5 +++++ + init/Kconfig | 1 + + 2 files changed, 6 insertions(+) + +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3310,6 +3310,11 @@ static int crypt_ctr(struct dm_target *t + goto bad; + } + ++#ifdef CONFIG_ZEN_INTERACTIVE ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -144,6 +144,7 @@ config ZEN_INTERACTIVE + + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber ++ DM-Crypt workqueues.......: yes -> no + + --- Virtual Memory Subsystem --------------------------- + diff --git a/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch new file mode 100644 index 0000000..35de516 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch @@ -0,0 +1,49 @@ +From 5a8fabcd4e7396500f2c0070f8b7ce9106eb9bfa Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Mon, 5 Sep 2022 11:35:20 -0500 +Subject: ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead + +Per an [issue][1] on the chromium project, swap-in readahead causes more +jank than not. 
This might be caused by poor optimization on the +swapping code, or the fact under memory pressure, we're pulling in pages +we don't need, causing more swapping. + +Either way, this is mainline/upstream to Chromium, and ChromeOS +developers care a lot about system responsiveness. Lets implement the +same change so Zen Kernel users benefit. + +[1]: https://bugs.chromium.org/p/chromium/issues/detail?id=263561 +--- + init/Kconfig | 1 + + mm/swap.c | 5 +++++ + 2 files changed, 6 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -154,6 +154,7 @@ config ZEN_INTERACTIVE + Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 + Pageblock order................: 10 -> 3 ++ Swap-in readahead..............: 3 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -1126,6 +1126,10 @@ void folio_batch_remove_exceptionals(str + */ + void __init swap_setup(void) + { ++#ifdef CONFIG_ZEN_INTERACTIVE ++ /* Only swap-in pages requested, avoid readahead */ ++ page_cluster = 0; ++#else + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1137,4 +1141,5 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++#endif + } diff --git a/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch b/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch new file mode 100644 index 0000000..d89a19c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch @@ -0,0 +1,19 @@ +From 238d23d6cb3f8610aa1cd3bdaeb398c63a4c9cb2 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 1 Oct 2024 02:22:46 +0200 +Subject: ZEN: Update VHBA driver + +remote https://github.com/cdemu/cdemu +tag vhba-module-20240917 +--- + drivers/scsi/vhba/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/scsi/vhba/Makefile ++++ b/drivers/scsi/vhba/Makefile +@@ -1,4 +1,4 @@ +-VHBA_VERSION := 20240202 ++VHBA_VERSION := 20240917 + + obj-$(CONFIG_VHBA) += vhba.o + ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror diff --git a/debian/patches/series b/debian/patches/series index 074ab11..1045ec3 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -108,3 +108,212 @@ bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch bugfix/all/kbuild-bpf-fix-btf-reproducibility.patch # ABI maintenance + +## custom patches + +krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch +krd/0002-established-timeout.patch +krd/0003-local-ports.patch +krd/0004-bridge-group_fwd_mask.patch + +## 3rd party patches + +mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch +mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch +mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch +mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch +mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch +mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch + +misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch +misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch +misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch +misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch 
+misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch +misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch +misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch +misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch +misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch +misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch +misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch +misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch +misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch +misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch +misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch +misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch +misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch +misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch +misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch + +misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch +misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch +misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch +misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch +misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch +misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch +misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch +misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch +misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch +misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch +misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch +misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch +misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch +misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch +misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch +misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch + +misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch +misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch +misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch + +patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch +patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch +patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch +patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch +patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch +patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch +patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch +patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch +patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch +patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch +patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch +patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch +patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch 
+patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch +patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch +patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch +patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch +patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch +patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch +patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch +patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch +patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch +patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch +patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch +patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch +patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch +patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch +patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch +patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch +patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch +patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch +patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch +patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch +patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch +patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch +patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch + +patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch +patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch +patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch +patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch +patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch +patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch +patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch +patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch +patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch +patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch +patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch +patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch +patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch +patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch +patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch +patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch +patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch + +patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch +patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch 
+patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch +patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch +patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch +patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch +patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch +patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch + +patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch +patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch +patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch + +patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch +patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch + +patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch +patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch + +patchset-xanmod/binder/0001-binder-turn-into-module.patch + +patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch +patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch +patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch + +patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch +patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch + +patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch + +patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch + +patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch +patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch +patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch +patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch +patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch +patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch + +patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch +patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch +patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch +patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch +patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch +patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch +patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch +patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch +patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch +patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch +patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch +patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch +patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch +patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch +patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch +patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch + 
+patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch +patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch +patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch +patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch +patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch +patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch +patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch +patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch +patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch +patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch +patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch +patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch +patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch +patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch +patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-mgLRU-to-protect-cache-used-in-.patch +patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch +patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch +patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch +patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch +patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch +patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch +patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch +patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch +patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch + +patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch +patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch +patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch +patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch +patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch +patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch +patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch +patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch +patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch +patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch +patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch +patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch + +patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch +patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch
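
One way to sanity-check that the extended series still applies cleanly is
to drive it with quilt from the top of an unpacked source tree. This is
just a convenient smoke test, not something the packaging itself requires;
quilt ignores the comment lines such as "## custom patches":

    QUILT_PATCHES=debian/patches quilt push -a   # apply everything in order
    QUILT_PATCHES=debian/patches quilt pop -a    # and unwind again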