diff --git a/debian/bin/genpatch-pfkernel b/debian/bin/genpatch-pfkernel new file mode 100755 index 0000000..1356fdb --- /dev/null +++ b/debian/bin/genpatch-pfkernel @@ -0,0 +1,61 @@ +#!/bin/sh +set -ef + +export GIT_OPTIONAL_LOCKS=0 + +w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w" + +dst='debian/patches/pf' +src='../linux-extras' +branches='amd-pstate amd-rapl cpuidle crypto fixes ksm zstd' + +[ -d "${dst}" ] + +kver= +if [ -n "$1" ] ; then + kver="$1" +else + kver=$(dpkg-parsechangelog --show-field=Version | sed -E 's/^[0-9]+://;s/-[^-]*$//' | cut -d. -f1-2) +fi +from="upstream/linux-${kver}.y" + +t=$(mktemp -d) ; : "${t:?}" + +cp -ar "${src}" "$t/" +cd "$t/${src##*/}" + +git config advice.skippedCherryPicks false + +for b in ${branches} ; do + ref="pf/$b-${kver}" + r="tmp-rebase-$b" + + git switch --detach "${ref}" + git switch -C "$r" + + if git rebase "${from}" ; then + [ -d "$w/${dst}/$b/" ] || mkdir -p "$w/${dst}/$b" + + set +e + env -C "$w" git ls-files -z | grep -zF "${dst}/$b/" | grep -zFv '/.' | env -C "$w" -u GIT_OPTIONAL_LOCKS xargs -r -0 git rm -f + find "$w/${dst}/$b/" -name '*.patch' -type f -exec rm -f {} + + set -e + + git format-patch -N --subject-prefix='' --output-directory "$w/${dst}/$b" "${from}..$r" + else + echo >&2 + git rebase --abort + echo >&2 + fi + + git switch -q --detach "${ref}" + git branch -D "$r" + echo >&2 +done + +cd "$w" ; rm -rf "$t" + +echo >&2 +echo 'put in debian/patches/series' >&2 +echo >&2 +find "${dst}/" -type f -name '*.patch' | sed -E 's#^debian/patches/##' | sort -V diff --git a/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch new file mode 100644 index 0000000..30a817b --- /dev/null +++ b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch @@ -0,0 +1,52 @@ +this reverts following commit: + + From: Josh Poimboeuf + Date: Thu, 14 Jan 2021 16:32:42 -0600 + Subject: objtool: Don't fail the kernel build on fatal errors + + [ Upstream commit 655cf86548a3938538642a6df27dd359e13c86bd ] + + This is basically a revert of commit 644592d32837 ("objtool: Fail the + kernel build on fatal errors"). + + That change turned out to be more trouble than it's worth. Failing the + build is an extreme measure which sometimes gets too much attention and + blocks CI build testing. + + These fatal-type warnings aren't yet as rare as we'd hope, due to the + ever-increasing matrix of supported toolchains/plugins and their + fast-changing nature as of late. + + Also, there are more people (and bots) looking for objtool warnings than + ever before, so even non-fatal warnings aren't likely to be ignored for + long. + + Suggested-by: Nick Desaulniers + Reviewed-by: Miroslav Benes + Reviewed-by: Nick Desaulniers + Reviewed-by: Kamalesh Babulal + Signed-off-by: Josh Poimboeuf + Signed-off-by: Sasha Levin + +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -4872,10 +4872,14 @@ int check(struct objtool_file *file) + } + + out: +- /* +- * For now, don't fail the kernel build on fatal warnings. These +- * errors are still fairly common due to the growing matrix of +- * supported toolchains and their recent pace of change. +- */ ++ if (ret < 0) { ++ /* ++ * Fatal error. The binary is corrupt or otherwise broken in ++ * some way, or objtool itself is broken. Fail the kernel ++ * build. 
++ */ ++ return ret; ++ } ++ + return 0; + } diff --git a/debian/patches/krd/0002-established-timeout.patch b/debian/patches/krd/0002-established-timeout.patch new file mode 100644 index 0000000..3330b53 --- /dev/null +++ b/debian/patches/krd/0002-established-timeout.patch @@ -0,0 +1,11 @@ +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -61,7 +61,7 @@ enum nf_ct_tcp_action { + static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, +- [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, ++ [TCP_CONNTRACK_ESTABLISHED] = 128 MINS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, diff --git a/debian/patches/krd/0003-local-ports.patch b/debian/patches/krd/0003-local-ports.patch new file mode 100644 index 0000000..fd20a12 --- /dev/null +++ b/debian/patches/krd/0003-local-ports.patch @@ -0,0 +1,11 @@ +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -1802,7 +1802,7 @@ static __net_init int inet_init_net(stru + /* + * Set defaults for local port range + */ +- net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; ++ net->ipv4.ip_local_ports.range = 65533u << 16 | 49152u; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* diff --git a/debian/patches/krd/0004-bridge-group_fwd_mask.patch b/debian/patches/krd/0004-bridge-group_fwd_mask.patch new file mode 100644 index 0000000..61ac436 --- /dev/null +++ b/debian/patches/krd/0004-bridge-group_fwd_mask.patch @@ -0,0 +1,37 @@ +--- a/net/bridge/br_input.c ++++ b/net/bridge/br_input.c +@@ -374,7 +374,11 @@ static rx_handler_result_t br_handle_fra + return RX_HANDLER_PASS; + + case 0x01: /* IEEE MAC (Pause) */ +- goto drop; ++ fwd_mask |= p->br->group_fwd_mask; ++ if (fwd_mask & (1u << dest[5])) ++ goto forward; ++ else ++ goto drop; + + case 0x0E: /* 802.1AB LLDP */ + fwd_mask |= p->br->group_fwd_mask; +--- a/net/bridge/br_netlink.c ++++ b/net/bridge/br_netlink.c +@@ -1365,8 +1365,6 @@ static int br_changelink(struct net_devi + if (data[IFLA_BR_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); + +- if (fwd_mask & BR_GROUPFWD_RESTRICTED) +- return -EINVAL; + br->group_fwd_mask = fwd_mask; + } + +--- a/net/bridge/br_sysfs_br.c ++++ b/net/bridge/br_sysfs_br.c +@@ -179,8 +179,6 @@ static ssize_t group_fwd_mask_show(struc + static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) + { +- if (val & BR_GROUPFWD_RESTRICTED) +- return -EINVAL; + + br->group_fwd_mask = val; + diff --git a/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch b/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch new file mode 100644 index 0000000..620da28 --- /dev/null +++ b/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch @@ -0,0 +1,52 @@ +From ce1cd7869a208112a8728d1fe9e373f78a2e4a6e Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 11 Jun 2019 12:26:55 -0400 +Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection + +This commit is a bug fix for the Linux TCP app-limited +(application-limited) logic that is used for collecting rate +(bandwidth) samples. + +Previously the app-limited logic only looked for "bubbles" of +silence in between application writes, by checking at the start +of each sendmsg. But "bubbles" of silence can also happen before +retransmits: e.g. 
bubbles can happen between an application write +and a retransmit, or between two retransmits. + +Retransmits are triggered by ACKs or timers. So this commit checks +for bubbles of app-limited silence upon ACKs or timers. + +Why does this commit check for app-limited state at the start of +ACKs and timer handling? Because at that point we know whether +inflight was fully using the cwnd. During processing the ACK or +timer event we often change the cwnd; after changing the cwnd we +can't know whether inflight was fully using the old cwnd. + +Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc +Change-Id: I37221506f5166877c2b110753d39bb0757985e68 +Signed-off-by: Alexandre Frade +--- + net/ipv4/tcp_input.c | 1 + + net/ipv4/tcp_timer.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3961,6 +3961,7 @@ static int tcp_ack(struct sock *sk, cons + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + diff --git a/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch b/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch new file mode 100644 index 0000000..b279c96 --- /dev/null +++ b/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch @@ -0,0 +1,74 @@ +From b32715fbe2ab96d1060ec37bb9c03feedf366494 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 24 Jun 2018 21:55:59 -0400 +Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp, + first_tx_mstamp to u32 to free up 8 bytes + +Free up some space for tracking inflight and losses for each +bw sample, in upcoming commits. + +These timestamps are in microseconds, and are now stored in 32 +bits. So they can only hold time intervals up to roughly 2^12 = 4096 +seconds. But Linux TCP RTT and RTO tracking has the same 32-bit +microsecond implementation approach and resulting deployment +limitations. So this is not introducing a new limit. And these should +not be a limitation for the foreseeable future. 
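To make the timestamp arithmetic concrete, here is a minimal user-space sketch (illustrative names, not the kernel code itself) of a 32-bit microsecond delta that remains correct across the ~4295-second wrap, assuming the measured interval is much shorter than the wrap period:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe delta of two 32-bit microsecond stamps, clamped at zero;
 * this mirrors the max_t(s32, t1 - t0, 0) idiom the patch introduces. */
static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
	int32_t delta = (int32_t)(t1 - t0);

	return delta > 0 ? (uint32_t)delta : 0;
}

int main(void)
{
	uint32_t t0 = 0xFFFFFFF0u;	/* 16 us before the 32-bit wrap */
	uint32_t t1 = 0x00000010u;	/* 16 us after the wrap */

	printf("%u\n", stamp32_us_delta(t1, t0));	/* prints 32 */
	return 0;
}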
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55 +Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 9 +++++++-- + net/ipv4/tcp_rate.c | 7 ++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -884,6 +884,11 @@ static inline u32 tcp_stamp_us_delta(u64 + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +978,9 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + diff --git a/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch b/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch new file mode 100644 index 0000000..07bebb5 --- /dev/null +++ b/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch @@ -0,0 +1,109 @@ +From 25856231832186fe13189b986cc0e91860c18201 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sat, 5 Aug 2017 11:49:50 -0400 +Subject: [PATCH 03/19] net-tcp_bbr: v2: snapshot packets in flight at transmit + time and pass in rate_sample + +CC algorithms may want to snapshot the number of packets in flight at +transmit time and pass in rate_sample, to understand the relationship +between inflight and losses or ECN signals, to try to find the highest +inflight value that has acceptable levels of loss/ECN marking. + +We split out the code to set an skb's tx.in_flight field into its own +function, so that this code can be used for the TCP_REPAIR "fake send" +code path that inserts skbs into the rtx queue without sending them. 
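As a rough standalone illustration of why the transmit-time snapshot is clamped (hypothetical names; the real field lives in TCP_SKB_CB(skb)->tx), the value is stored in a 20-bit field, so anything above (1 << 20) - 1 segments is saturated rather than allowed to overflow:

#include <stdio.h>

#define IN_FLIGHT_BITS	20
#define IN_FLIGHT_MAX	((1U << IN_FLIGHT_BITS) - 1)

struct tx_snapshot {
	unsigned int in_flight : IN_FLIGHT_BITS;	/* packets in flight at transmit */
	unsigned int unused    : 12;
};

static void record_in_flight(struct tx_snapshot *tx, unsigned int in_flight)
{
	if (in_flight > IN_FLIGHT_MAX)	/* saturate instead of wrapping the bitfield */
		in_flight = IN_FLIGHT_MAX;
	tx->in_flight = in_flight;
}

int main(void)
{
	struct tx_snapshot tx = { 0 };

	record_in_flight(&tx, 2000000);			/* above the 20-bit maximum */
	printf("%u\n", (unsigned int)tx.in_flight);	/* prints 1048575 */
	return 0;
}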
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea +Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b +Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63 +Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 6 ++++++ + net/ipv4/tcp_output.c | 1 + + net/ipv4/tcp_rate.c | 20 ++++++++++++++++++++ + 3 files changed, 27 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -981,6 +981,10 @@ struct tcp_skb_cb { + u32 first_tx_mstamp; + /* when we reached the "delivered" count */ + u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1136,6 +1140,7 @@ struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ +@@ -1258,6 +1263,7 @@ static inline void tcp_ca_event(struct s + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2765,6 +2765,7 @@ static bool tcp_write_xmit(struct sock * + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -67,6 +85,7 @@ void tcp_rate_skb_sent(struct sock *sk, + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -96,6 +115,7 @@ void tcp_rate_skb_delivered(struct sock + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ diff --git a/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch b/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch new file mode 100644 index 0000000..8882ff8 --- /dev/null +++ b/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch @@ -0,0 +1,70 @@ +From b1772710e8b5b98c09e96d4f1af620cd938fddf7 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Thu, 12 Oct 2017 23:44:27 -0400 +Subject: [PATCH 04/19] net-tcp_bbr: v2: count packets lost over TCP rate + sampling interval + +For understanding the relationship between inflight and packet loss +signals, to try to find the highest inflight value that has acceptable +levels of packet losses. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab +Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 5 ++++- + net/ipv4/tcp_rate.c | 3 +++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -985,6 +985,7 @@ struct tcp_skb_cb { + #define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1138,11 +1139,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -84,6 +84,7 @@ void tcp_rate_skb_sent(struct sock *sk, + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; + tcp_set_tx_in_flight(sk, skb); + } +@@ -110,6 +111,7 @@ void tcp_rate_skb_delivered(struct sock + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; +@@ -165,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 d + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ diff --git a/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch b/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch new file mode 100644 index 0000000..1045dbf --- /dev/null +++ b/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch @@ -0,0 +1,38 @@ +From fdf01142aea8645186e080f1278d3b5a5fd8c66c Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Mon, 19 Nov 2018 13:48:36 -0500 +Subject: [PATCH 05/19] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece + +For understanding the relationship between inflight and ECN signals, +to try to find the highest inflight value that has acceptable levels +ECN marking. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8 +Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1157,6 +1157,7 @@ struct rate_sample { + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4060,6 +4060,7 @@ static int tcp_ack(struct sock *sk, cons + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); diff --git a/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch b/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch new file mode 100644 index 0000000..1582d9e --- /dev/null +++ b/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch @@ -0,0 +1,57 @@ +From a3e88432c2ebf12de9c2053a13417ddf2ad4cb4e Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 7 Aug 2018 21:52:06 -0400 +Subject: [PATCH 06/19] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC + module callback API + +For connections experiencing reordering, RACK can mark packets lost +long after we receive the SACKs/ACKs hinting that the packets were +actually lost. + +This means that CC modules cannot easily learn the volume of inflight +data at which packet loss happens by looking at the current inflight +or even the packets in flight when the most recently SACKed packet was +sent. 
To learn this, CC modules need to know how many packets were in +flight at the time lost packets were sent. This new callback, combined +with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this. + +This also provides a consistent callback that is invoked whether +packets are marked lost upon ACK processing, using the RACK reordering +timer, or at RTO time. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4 +Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 3 +++ + net/ipv4/tcp_input.c | 5 +++++ + 2 files changed, 8 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1184,6 +1184,9 @@ struct tcp_congestion_ops { + /* override sysctl_tcp_min_tso_segs */ + u32 (*min_tso_segs)(struct sock *sk); + ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); ++ + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(s + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) diff --git a/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch b/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch new file mode 100644 index 0000000..35dec47 --- /dev/null +++ b/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch @@ -0,0 +1,59 @@ +From af7d33e71649b8e2ae00dccf336720a8ab891606 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Wed, 1 May 2019 20:16:33 -0400 +Subject: [PATCH 07/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in + tcp_shifted_skb() + +When tcp_shifted_skb() updates state as adjacent SACKed skbs are +coalesced, previously the tx.in_flight was not adjusted, so we could +get contradictory state where the skb's recorded pcount was bigger +than the tx.in_flight (the number of segments that were in_flight +after sending the skb). + +Normally have a SACKed skb with contradictory pcount/tx.in_flight +would not matter. However, with SACK reneging, the SACKed bit is +removed, and an skb once again becomes eligible for retransmitting, +fragmenting, SACKing, etc. Packetdrill testing verified the following +sequence is possible in a kernel that does not have this commit: + + - skb N is SACKed + - skb N+1 is SACKed and combined with skb N using tcp_shifted_skb() + - tcp_shifted_skb() will increase the pcount of prev, + but leave tx.in_flight as-is + - so prev skb can have pcount > tx.in_flight + - RTO, tcp_timeout_mark_lost(), detect reneg, + remove "SACKed" bit, mark skb N as lost + - find pcount of skb N is greater than its tx.in_flight + +I suspect this issue iw what caused the bbr2_inflight_hi_from_lost_skb(): + WARN_ON_ONCE(inflight_prev < 0) +to fire in production machines using bbr2. 
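A minimal sketch of the bookkeeping this adds (standalone types invented for illustration, not the kernel structures): when pcount segments are shifted from skb into prev, the recorded in-flight snapshot moves with them, so neither skb ends up with a pcount larger than its tx.in_flight:

#include <stdio.h>

struct skb_state {
	unsigned int pcount;	/* segments carried by this skb */
	unsigned int in_flight;	/* packets in flight when it was sent */
};

static void shift_segments(struct skb_state *prev, struct skb_state *skb,
			   unsigned int pcount)
{
	skb->pcount -= pcount;
	prev->pcount += pcount;

	/* Keep the in-flight snapshots consistent with the new pcounts. */
	skb->in_flight = skb->in_flight >= pcount ? skb->in_flight - pcount : 0;
	prev->in_flight += pcount;
}

int main(void)
{
	struct skb_state prev = { .pcount = 2, .in_flight = 10 };
	struct skb_state skb  = { .pcount = 3, .in_flight = 13 };

	shift_segments(&prev, &skb, 3);		/* merge all of skb into prev */
	printf("prev %u/%u, skb %u/%u\n",
	       prev.pcount, prev.in_flight, skb.pcount, skb.in_flight);
	return 0;
}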
+ +Effort: net-tcp_bbr +Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715 +Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55 +Signed-off-by: Alexandre Frade +--- + net/ipv4/tcp_input.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1506,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep diff --git a/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch b/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch new file mode 100644 index 0000000..f437449 --- /dev/null +++ b/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch @@ -0,0 +1,97 @@ +From a4d44bce49f61f8755f558dc40edff5f8958b7c6 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Wed, 1 May 2019 20:16:25 -0400 +Subject: [PATCH 08/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in + tcp_fragment() + +When we fragment an skb that has already been sent, we need to update +the tx.in_flight for the first skb in the resulting pair ("buff"). + +Because we were not updating the tx.in_flight, the tx.in_flight value +was inconsistent with the pcount of the "buff" skb (tx.in_flight would +be too high). That meant that if the "buff" skb was lost, then +bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value +that is too high. This could result in longer queues and higher packet +loss. + +Packetdrill testing verified that without this commit, when the second +half of an skb is SACKed and then later the first half of that skb is +marked lost, the calculated inflight_hi was incorrect. + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874 +Origin-9xx-SHA1: a0eb099690af net-tcp_bbr: v2: fix tcp_fragment() tx.in_flight recomputation [prod feb 8 2021; use as a fixup] +Origin-9xx-SHA1: 885503228153ff0c9114e net-tcp_bbr: v2: introduce tcp_skb_tx_in_flight_is_suspicious() helper for warnings +Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 15 +++++++++++++++ + net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++- + 2 files changed, 40 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1283,6 +1283,21 @@ static inline bool tcp_skb_sent_after(u6 + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1601,7 +1601,7 @@ int tcp_fragment(struct sock *sk, enum t + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1676,6 +1676,30 @@ int tcp_fragment(struct sock *sk, enum t + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ diff --git a/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch b/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch new file mode 100644 index 0000000..4f9b314 --- /dev/null +++ b/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch @@ -0,0 +1,73 @@ +From 65cca0e8fd954a150ec874650af47f7800ea3049 Mon Sep 17 00:00:00 2001 +From: Yousuk Seung +Date: Wed, 23 May 2018 17:55:54 -0700 +Subject: [PATCH 09/19] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS + +Add a a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a +congestion control module to receive CE events. + +Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN +bit in opts flag to receive CE events but this may incur changes in ECN +behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS +that allows congestion control modules to receive CE events +independently of TCP_CONG_NEEDS_ECN. 
+ +Effort: net-tcp +Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b +Change-Id: I2255506985242f376d910c6fd37daabaf4744f24 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 14 +++++++++++++- + net/ipv4/tcp_input.c | 4 ++-- + 2 files changed, 15 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1119,7 +1119,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1251,6 +1255,14 @@ static inline char *tcp_ca_get_name_by_k + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct so + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct so + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; diff --git a/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch b/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch new file mode 100644 index 0000000..b195ec9 --- /dev/null +++ b/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch @@ -0,0 +1,118 @@ +From 3acb852e1cfcdeea388bd428c6dd81609fd40792 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Fri, 27 Sep 2019 17:10:26 -0400 +Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API + +Reorganize the API for CC modules so that the CC module once again +gets complete control of the TSO sizing decision. This is how the API +was set up around 2016 and the initial BBRv1 upstreaming. Later Eric +Dumazet simplified it. But with wider testing it now seems that to +avoid CPU regressions BBR needs to have a different TSO sizing +function. + +This is necessary to handle cases where there are many flows +bottlenecked on the sender host's NIC, in which case BBR's pacing rate +is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because +BBR's pacing rate adapts to the low bandwidth share each flow sees. By +contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very +large cwnd, and thus large pacing rate and large TSO burst size. 
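Roughly, the burst budget described above is derived from the pacing rate; here is a user-space sketch of that calculation (invented function, assuming the common pacing shift of 10, i.e. about 1 ms worth of data per burst):

#include <stdint.h>
#include <stdio.h>

static uint32_t tso_segs_budget(uint64_t pacing_rate_Bps, uint32_t pacing_shift,
				uint32_t mss, uint32_t min_segs)
{
	uint64_t bytes = pacing_rate_Bps >> pacing_shift;	/* byte budget per burst */
	uint32_t segs = (uint32_t)(bytes / mss);

	return segs > min_segs ? segs : min_segs;
}

int main(void)
{
	/* A 100 Mbit/s flow gets sizable bursts; a 1 Mbit/s flow gets the minimum. */
	printf("%u\n", tso_segs_budget(12500000, 10, 1448, 2));	/* prints 8 */
	printf("%u\n", tso_segs_budget(125000, 10, 1448, 2));	/* prints 2 */
	return 0;
}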
+ +Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 4 ++-- + net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++----------- + net/ipv4/tcp_output.c | 11 +++++------ + 3 files changed, 33 insertions(+), 19 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1185,8 +1185,8 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs( + return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; + } + ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ u32 segs; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -1150,7 +1165,7 @@ static struct tcp_congestion_ops tcp_bbr + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2057,13 +2057,12 @@ static u32 tcp_tso_autosize(const struct + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; ++ u32 tso_segs; + +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); +- +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + diff --git a/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch b/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch new file mode 100644 index 0000000..894d5b1 --- /dev/null +++ b/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch @@ -0,0 +1,72 @@ +From 3741ada76bab5111cbb9c279cf27e67a0334eb05 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 7 Jan 2024 21:11:26 -0300 +Subject: [PATCH 11/19] net-tcp: add fast_ack_mode=1: skip rwin check in + tcp_fast_ack_mode__tcp_ack_snd_check() + +Add logic for an experimental TCP connection behavior, enabled with +tp->fast_ack_mode = 1, which disables checking the receive window +before sending an ack in __tcp_ack_snd_check(). If this behavior is +enabled, the data receiver sends an ACK if the amount of data is > +RCV.MSS. + +Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4 +Signed-off-by: Alexandre Frade +--- + include/linux/tcp.h | 3 ++- + net/ipv4/tcp.c | 1 + + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 5 +++-- + 4 files changed, 7 insertions(+), 3 deletions(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,8 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ unused:3; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5763,13 +5763,14 @@ static void __tcp_ack_snd_check(struct s + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ diff --git a/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch b/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch new file mode 100644 index 0000000..07b0d0c --- /dev/null +++ b/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch @@ -0,0 +1,45 @@ +From e5d35b7c882b7001f8a31b14c9f08917230dedc3 Mon Sep 17 00:00:00 2001 +From: Jianfeng Wang +Date: Fri, 19 Jun 2020 17:33:45 +0000 +Subject: [PATCH 12/19] net-tcp_bbr: v2: record app-limited status of + TLP-repaired flight + +When sending a TLP retransmit, record whether the outstanding flight +of data is application limited. This is important for congestion +control modules that want to respond to losses repaired by TLP +retransmits. This is important because the following scenarios convey +very different information: + (1) a packet loss with a small number of packets in flight; + (2) a packet loss with the maximum amount of data in flight allowed + by the CC module; + +Effort: net-tcp_bbr +Change-Id: Ic8ae567caa4e4bfd5fd82c3d4be12a5d9171655e +Signed-off-by: Alexandre Frade +--- + include/linux/tcp.h | 3 ++- + net/ipv4/tcp_output.c | 1 + + 2 files changed, 3 insertions(+), 1 deletion(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -370,7 +370,8 @@ struct tcp_sock { + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ + fast_ack_mode:2, /* which fast ack mode ? */ +- unused:3; ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -3003,6 +3003,7 @@ void tcp_send_loss_probe(struct sock *sk + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + diff --git a/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch b/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch new file mode 100644 index 0000000..217eaea --- /dev/null +++ b/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch @@ -0,0 +1,45 @@ +From 77e7c22b63f8934206b1e89c173558c3967f0779 Mon Sep 17 00:00:00 2001 +From: Jianfeng Wang +Date: Tue, 16 Jun 2020 17:41:19 +0000 +Subject: [PATCH 13/19] net-tcp_bbr: v2: inform CC module of losses repaired by + TLP probe + +Before this commit, when there is a packet loss that creates a sequence +hole that is filled by a TLP loss probe, then tcp_process_tlp_ack() +only informs the congestion control (CC) module via a back-to-back entry +and exit of CWR. But some congestion control modules (e.g. BBR) do not +respond to CWR events. + +This commit adds a new CA event with which the core TCP stack notifies +the CC module when a loss is repaired by a TLP. This will allow CC +modules that do not use the CWR mechanism to have a custom handler for +such TLP recoveries. 
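To sketch how a module might consume the new event (hypothetical my_cc names; the real entry point is the cwnd_event callback in tcp_congestion_ops), a module that ignores the CWR enter/exit pair can still account for losses repaired by a TLP probe:

#include <stdio.h>

enum ca_event {
	CA_EVENT_LOSS,
	CA_EVENT_TLP_RECOVERY,
	/* other events elided */
};

struct my_cc {
	unsigned int tlp_repaired_losses;	/* losses repaired by a TLP probe */
};

static void my_cc_cwnd_event(struct my_cc *cc, enum ca_event ev)
{
	if (ev == CA_EVENT_TLP_RECOVERY)
		cc->tlp_repaired_losses++;	/* treat as a (mild) loss signal */
}

int main(void)
{
	struct my_cc cc = { 0 };

	my_cc_cwnd_event(&cc, CA_EVENT_TLP_RECOVERY);
	printf("%u\n", cc.tlp_repaired_losses);	/* prints 1 */
	return 0;
}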
+ +Effort: net-tcp_bbr +Change-Id: Ieba72332b401b329bff5a641d2b2043a3fb8f632 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1097,6 +1097,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3859,6 +3859,7 @@ static void tcp_process_tlp_ack(struct s + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); diff --git a/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch b/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch new file mode 100644 index 0000000..43a1ade --- /dev/null +++ b/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch @@ -0,0 +1,73 @@ +From cab22a8e2e87870e8334a12ffcd0ba04ea81126f Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Mon, 21 Sep 2020 14:46:26 -0400 +Subject: [PATCH 14/19] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq + into rate_sample + +Introduce is_acking_tlp_retrans_seq into rate_sample. This bool will +export to the CC module the knowledge of whether the current ACK +matched a TLP retransmit. + +Note that when this bool is true, we cannot yet tell (in general) whether +this ACK is for the original or the TLP retransmit. + +Effort: net-tcp_bbr +Change-Id: I2e6494332167e75efcbdc99bd5c119034e9c39b4 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_input.c | 12 +++++++++--- + 2 files changed, 10 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1161,6 +1161,7 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ + }; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3842,7 +3842,8 @@ static void tcp_replace_ts_recent(struct + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3870,6 +3871,11 @@ static void tcp_process_tlp_ack(struct s + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4053,7 +4059,7 @@ static int tcp_ack(struct sock *sk, cons + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4097,7 +4103,7 @@ no_queue: + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: diff --git a/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch b/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch new file mode 100644 index 0000000..5271d85 --- /dev/null +++ b/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch @@ -0,0 +1,112 @@ +From 38dd25482f815d949fec91edd7694b2f15823f67 Mon Sep 17 00:00:00 2001 +From: David Morley +Date: Fri, 14 Jul 2023 11:07:56 -0400 +Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW + +Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW. + +This feature indicates that the given destination network is a +low-latency ECN environment, meaning both that ECN CE marks are +applied by the network using a low-latency marking threshold and also +that TCP endpoints provide precise per-data-segment ECN feedback in +ACKs (where the ACK ECE flag echoes the received CE status of all +newly-acknowledged data segments). This feature indication can be used +by congestion control algorithms to decide how to interpret ECN +signals over the given destination network. + +This feature is appropriate for datacenter-style ECN marking, such as +the ECN marking approach expected by DCTCP or BBR congestion control +modules. 
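A compact standalone illustration of the plumbing (the constants mirror the values the patch adds, but the function is invented for this sketch): the destination route's feature bitmask is folded into the connection's ecn_flags, which a congestion control module can later consult to decide how to interpret CE marks:

#include <stdio.h>

#define FEATURE_ECN	(1u << 0)	/* mirrors RTAX_FEATURE_ECN */
#define FEATURE_ECN_LOW	(1u << 5)	/* mirrors RTAX_FEATURE_ECN_LOW */

#define ECN_OK	1u			/* mirrors TCP_ECN_OK */
#define ECN_LOW	16u			/* mirrors TCP_ECN_LOW */

static unsigned int ecn_flags_from_route(unsigned int route_features)
{
	unsigned int ecn_flags = 0;

	if (route_features & FEATURE_ECN)
		ecn_flags |= ECN_OK;
	if (route_features & FEATURE_ECN_LOW)
		ecn_flags |= ECN_LOW;	/* low-latency, DCTCP-style CE marking */
	return ecn_flags;
}

int main(void)
{
	printf("0x%x\n", ecn_flags_from_route(FEATURE_ECN | FEATURE_ECN_LOW));	/* 0x11 */
	return 0;
}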
+ +Signed-off-by: David Morley +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Tested-by: David Morley +Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 10 ++++++++++ + include/uapi/linux/rtnetlink.h | 4 +++- + net/ipv4/tcp_minisocks.c | 2 ++ + net/ipv4/tcp_output.c | 6 ++++-- + 4 files changed, 19 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -777,6 +778,15 @@ static inline void tcp_fast_path_check(s + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + u32 tcp_delack_max(const struct sock *sk); + + /* Compute the actual rto_min value */ +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *s + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + diff --git a/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch b/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch new file mode 100644 index 0000000..f1ee5be --- /dev/null +++ b/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch @@ -0,0 +1,2821 @@ +From 6e06d157aa50e3288c749919a81f04ec792d2d91 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Tue, 11 Jun 2019 12:54:22 -0400 +Subject: [PATCH 16/19] net-tcp_bbr: v3: update TCP "bbr" congestion control + module to BBRv3 + +BBR v3 is an enhacement to the BBR v1 algorithm. It's designed to aim for lower +queues, lower loss, and better Reno/CUBIC coexistence than BBR v1. 
+ +BBR v3 maintains the core of BBR v1: an explicit model of the network +path that is two-dimensional, adapting to estimate the (a) maximum +available bandwidth and (b) maximum safe volume of data a flow can +keep in-flight in the network. It maintains the estimated BDP as a +core guide for estimating an appropriate level of in-flight data. + +BBR v3 makes several key enhancements: + +o Its bandwidth-probing time scale is adapted, within bounds, to allow improved +coexistence with Reno and CUBIC. The bandwidth-probing time scale is (a) +extended dynamically based on estimated BDP to improve coexistence with +Reno/CUBIC; (b) bounded by an interactive wall-clock time-scale to be more +scalable and responsive than Reno and CUBIC. + +o Rather than being largely agnostic to loss and ECN marks, it explicitly uses +loss and (DCTCP-style) ECN signals to maintain its model. + +o It aims for lower losses than v1 by adjusting its model to attempt to stay +within loss rate and ECN mark rate bounds (loss_thresh and ecn_thresh, +respectively). + +o It adapts to loss/ECN signals even when the application is running out of +data ("application-limited"), in case the "application-limited" flow is also +"network-limited" (the bw and/or inflight available to this flow is lower than +previously estimated when the flow ran out of data). + +o It has a three-part model: the model explicit three tracks operating points, +where an operating point is a tuple: (bandwidth, inflight). The three operating +points are: + + o latest: the latest measurement from the current round trip + o upper bound: robust, optimistic, long-term upper bound + o lower bound: robust, conservative, short-term lower bound + +These are stored in the following state variables: + + o latest: bw_latest, inflight_latest + o lo: bw_lo, inflight_lo + o hi: bw_hi[2], inflight_hi + +To gain intuition about the meaning of the three operating points, it +may help to consider the analogs in CUBIC, which has a somewhat +analogous three-part model used by its probing state machine: + + BBR param CUBIC param + ----------- ------------- + latest ~ cwnd + lo ~ ssthresh + hi ~ last_max_cwnd + +The analogy is only a loose one, though, since the BBR operating +points are calculated differently, and are 2-dimensional (bw,inflight) +rather than CUBIC's one-dimensional notion of operating point +(inflight). + +o It uses the three-part model to adapt the magnitude of its bandwidth +to match the estimated space available in the buffer, rather than (as +in BBR v1) assuming that it was always acceptable to place 0.25*BDP in +the bottleneck buffer when probing (commodity datacenter switches +commonly do not have that much buffer for WAN flows). When BBR v3 +estimates it hit a buffer limit during probing, its bandwidth probing +then starts gently in case little space is still available in the +buffer, and the accelerates, slowly at first and then rapidly if it +can grow inflight without seeing congestion signals. In such cases, +probing is bounded by inflight_hi + inflight_probe, where +inflight_probe grows as: [0, 1, 2, 4, 8, 16,...]. This allows BBR to +keep losses low and bounded if a bottleneck remains congested, while +rapidly/scalably utilizing free bandwidth when it becomes available. + +o It has a slightly revised state machine, to achieve the goals above. 
+ BBR_BW_PROBE_UP: pushes up inflight to probe for bw/vol + BBR_BW_PROBE_DOWN: drain excess inflight from the queue + BBR_BW_PROBE_CRUISE: use pipe, w/ headroom in queue/pipe + BBR_BW_PROBE_REFILL: try refill the pipe again to 100%, leaving queue empty + +o The estimated BDP: BBR v3 continues to maintain an estimate of the +path's two-way propagation delay, by tracking a windowed min_rtt, and +coordinating (on an as-ndeeded basis) to try to expose the two-way +propagation delay by draining the bottleneck queue. + +BBR v3 continues to use its min_rtt and (currently-applicable) bandwidth +estimate to estimate the current bandwidth-delay product. The estimated BDP +still provides one important guideline for bounding inflight data. However, +because any min-filtered RTT and max-filtered bw inherently tend to both +overestimate, the estimated BDP is often too high; in this case loss or ECN +marks can ensue, in which case BBR v3 adjusts inflight_hi and inflight_lo to +adapt its sending rate and inflight down to match the available capacity of the +path. + +o Space: Note that ICSK_CA_PRIV_SIZE increased. This is because BBR v3 +requires more space. Note that much of the space is due to support for +per-socket parameterization and debugging in this release for research +and debugging. With that state removed, the full "struct bbr" is 140 +bytes, or 144 with padding. This is an increase of 40 bytes over the +existing ca_priv space. + +o Code: BBR v3 reuses many pieces from BBR v1. But it omits the following + significant pieces: + + o "packet conservation" (bbr_set_cwnd_to_recover_or_restore(), + bbr_can_grow_inflight()) + o long-term bandwidth estimator ("policer mode") + + The code layout tries to keep BBR v3 code near the bottom of the + file, so that v1-applicable code in the top does not accidentally + refer to v3 code. 
+ +o Docs: + See the following docs for more details and diagrams decsribing the BBR v3 + algorithm: + https://datatracker.ietf.org/meeting/104/materials/slides-104-iccrg-an-update-on-bbr-00 + https://datatracker.ietf.org/meeting/102/materials/slides-102-iccrg-an-update-on-bbr-work-at-google-00 + +o Internal notes: + For this upstream rebase, Neal started from: + git show fed518041ac6:net/ipv4/tcp_bbr.c > net/ipv4/tcp_bbr.c + then removed dev instrumentation (dynamic get/set for parameters) + and code that was only used by BBRv1 + +Effort: net-tcp_bbr +Origin-9xx-SHA1: 2c84098e60bed6d67dde23cd7538c51dee273102 +Change-Id: I125cf26ba2a7a686f2fa5e87f4c2afceb65f7a05 +Signed-off-by: Alexandre Frade +--- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 2 +- + include/uapi/linux/inet_diag.h | 23 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp_bbr.c | 2214 +++++++++++++++++++++------- + 5 files changed, 1740 insertions(+), 524 deletions(-) + +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -2473,7 +2473,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. 
It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. + + choice + prompt "Default TCP congestion control" +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. 
+@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? 
*/ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200 + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UN + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_res + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const st + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_ra + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rt + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,31 +472,38 @@ static void bbr_set_pacing_rate(struct s + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) +-{ +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; +-} +- +-/* Return the number of segments BBR would like in a TSO/GSO skb, given +- * a particular max gso size as a constraint. ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). + */ + static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) + { +- u32 segs; ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); + ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return segs; + } + + /* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. 
*/ +-static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } +@@ -329,7 +513,7 @@ static u32 bbr_tso_segs_goal(struct sock + { + struct tcp_sock *tp = tcp_sk(sk); + +- return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -349,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(s + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -360,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(s + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -382,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -402,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -473,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(stru + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -484,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(stru + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -552,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -629,191 +738,49 @@ static void bbr_reset_startup_mode(struc + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; + +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -827,7 +794,7 @@ static void bbr_update_bw(struct sock *s + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -835,15 +802,19 @@ static void bbr_update_ack_aggregation(s + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -877,49 +848,6 @@ static void bbr_update_ack_aggregation(s + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -929,9 +857,9 @@ static void bbr_check_probe_rtt_done(str + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -957,23 +885,35 @@ static void bbr_update_min_rtt(struct so + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -982,9 +922,9 @@ static void bbr_update_min_rtt(struct so + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -1005,18 +945,20 @@ static void bbr_update_gains(struct sock + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1028,27 +970,1108 @@ static void bbr_update_gains(struct sock + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++{ ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; ++} ++ ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); ++} ++ ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} ++ ++/* Reset the estimator for reaching full bandwidth based on bw plateau. */ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw = 0; ++ bbr->full_bw_cnt = 0; ++ bbr->full_bw_now = 0; ++} ++ ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); ++ ++ return min(bdp, tcp_sk(sk)->snd_cwnd); ++} ++ ++static bool bbr_is_probing_bandwidth(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? 
*/ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) + { +- bbr_update_bw(sk, rs); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. */ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; ++} ++ ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. ++ */ ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. 
*/ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? 
*/ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. ++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if we have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta).
*/ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. ++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. 
*/ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. 
++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. 
++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. */ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. 
++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less than that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW.
*/ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides significant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well as control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not.
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; + } + + __bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); + +- bbr_update_model(sk, rs); ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; + ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; + } + + __bpf_kfunc static void bbr_init(struct sock *sk) +@@ -1056,20 +2079,21 @@ __bpf_kfunc static void bbr_init(struct + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = tp->delivered; + bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ +- + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + +@@ -1080,7 +2104,7 @@ __bpf_kfunc static void bbr_init(struct + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); ++ + bbr_reset_startup_mode(sk); + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; +@@ -1090,78 +2114,236 @@ __bpf_kfunc static void bbr_init(struct + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ 
++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. +- */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. 
++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. */ + __bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) + { + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ +- bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +@@ -1174,10 +2356,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1210,5 +2393,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch b/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch new file mode 100644 index 0000000..af8f105 --- /dev/null +++ 
b/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch @@ -0,0 +1,59 @@ +From 99e86f904f246ae9ec7a13d1b920eaf4a8c2d47b Mon Sep 17 00:00:00 2001 +From: Adithya Abraham Philip +Date: Fri, 11 Jun 2021 21:56:10 +0000 +Subject: [PATCH 17/19] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT + on retransmits + +Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to +indicate that retransmitted packets and pure ACKs must have the +ECT bit set. This is necessary for BBR, which when using +ECN expects ECT to be set even on retransmitted packets and ACKs. + +Previous to this addition of TCP_ECN_ECT_PERMANENT, CCAs which can use +ECN but don't "need" it did not have a way to indicate that ECT should +be set on retransmissions/ACKs. + +Signed-off-by: Adithya Abraham Philip +Signed-off-by: Neal Cardwell +Change-Id: I8b048eaab35e136fe6501ef6cd89fd9faa15e6d2 +Signed-off-by: Alexandre Frade +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_bbr.c | 3 +++ + net/ipv4/tcp_output.c | 3 ++- + 3 files changed, 6 insertions(+), 1 deletion(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,7 @@ static inline void tcp_dec_quickack_mode + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 + #define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -2151,6 +2151,9 @@ __bpf_kfunc static void bbr_init(struct + bbr->plb.pause_until = 0; + + tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; + } + + /* BBR marks the current round trip as a loss round. */ +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -390,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } diff --git a/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch b/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch new file mode 100644 index 0000000..a5d2395 --- /dev/null +++ b/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch @@ -0,0 +1,38 @@ +From 5d7cb61552d374bcaaa95022129b4ca1eace1c33 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 23 Jul 2023 23:25:34 -0400 +Subject: [PATCH 18/19] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options + field + +Analogous to other important ECN information, export TCPI_OPT_ECN_LOW +in tcp_info tcpi_options field. 
+ +Signed-off-by: Neal Cardwell +Change-Id: I08d8d8c7e8780e6e37df54038ee50301ac5a0320 +Signed-off-by: Alexandre Frade +--- + include/uapi/linux/tcp.h | 1 + + net/ipv4/tcp.c | 2 ++ + 2 files changed, 3 insertions(+) + +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3850,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struc + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) diff --git a/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch b/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch new file mode 100644 index 0000000..0fb0528 --- /dev/null +++ b/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch @@ -0,0 +1,42 @@ +From 39838c2f0b09bec02004c092904aada85da2bc2e Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 11 Mar 2024 12:01:13 -0300 +Subject: [PATCH 19/19] x86/cfi,bpf: Add tso_segs and skb_marked_lost to + bpf_struct_ops CFI + +Rebased-by: Oleksandr Natalenko +[ https://github.com/sirlucjan/kernel-patches/blob/master/6.8/bbr3-patches/0001-tcp-bbr3-initial-import.patch ] +Signed-off-by: Alexandre Frade +--- + net/ipv4/bpf_tcp_ca.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_o + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch b/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch new file mode 100644 index 0000000..0072ea3 --- /dev/null +++ b/debian/patches/misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch @@ -0,0 +1,375 @@ +From 60b01019526236e40466cf20bf1192074e5e1a7c Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:27 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ANY. + +This corresponds to part of the functionality of the NT syscall +NtWaitForMultipleObjects(). 
Specifically, it implements the behaviour where +the third argument (wait_any) is TRUE, and it does not handle alertable waits. +Those features have been split out into separate patches to ease review. + +This patch therefore implements the wait/wake infrastructure which comprises the +core of ntsync's functionality. + +NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike +poll(), it "consumes" objects when they are signaled. For semaphores, this means +decreasing one from the internal counter. At most one object can be consumed by +this function. + +This wait/wake model is fundamentally different from that used anywhere else in +the kernel, and for that reason ntsync does not use any existing infrastructure, +such as futexes, kernel mutexes or semaphores, or wait_event(). + +Up to 64 objects can be waited on at once. As soon as one is signaled, the +object with the lowest index is consumed, and that index is returned via the +"index" field. + +A timeout is supported. The timeout is passed as a u64 nanosecond value, which +represents absolute time measured against either the MONOTONIC or REALTIME clock +(controlled by the flags argument). If U64_MAX is passed, the ioctl waits +indefinitely. + +This ioctl validates that all objects belong to the relevant device. This is not +necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be +necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch. + +Some padding fields are added for alignment and for fields which will be added +in future patches (split out to ease review). + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 245 ++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 14 +++ + 2 files changed, 259 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -6,11 +6,16 @@ + */ + + #include ++#include + #include + #include ++#include ++#include + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -30,6 +35,8 @@ enum ntsync_type { + * + * Both rely on struct file for reference counting. Individual + * ntsync_obj objects take a reference to the device when created. ++ * Wait operations take a reference to each object being waited on for ++ * the duration of the wait. + */ + + struct ntsync_obj { +@@ -47,12 +54,55 @@ struct ntsync_obj { + __u32 max; + } sem; + } u; ++ ++ struct list_head any_waiters; ++}; ++ ++struct ntsync_q_entry { ++ struct list_head node; ++ struct ntsync_q *q; ++ struct ntsync_obj *obj; ++ __u32 index; ++}; ++ ++struct ntsync_q { ++ struct task_struct *task; ++ ++ /* ++ * Protected via atomic_try_cmpxchg(). Only the thread that wins the ++ * compare-and-swap may actually change object states and wake this ++ * task. ++ */ ++ atomic_t signaled; ++ ++ __u32 count; ++ struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { + struct file *file; + }; + ++static void try_wake_any_sem(struct ntsync_obj *sem) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&sem->lock); ++ ++ list_for_each_entry(entry, &sem->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!sem->u.sem.count) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ sem->u.sem.count--; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -88,6 +138,8 @@ static int ntsync_sem_post(struct ntsync + + prev_count = sem->u.sem.count; + ret = post_sem_state(sem, args); ++ if (!ret) ++ try_wake_any_sem(sem); + + spin_unlock(&sem->lock); + +@@ -141,6 +193,7 @@ static struct ntsync_obj *ntsync_alloc_o + obj->dev = dev; + get_file(dev->file); + spin_lock_init(&obj->lock); ++ INIT_LIST_HEAD(&obj->any_waiters); + + return obj; + } +@@ -191,6 +244,196 @@ static int ntsync_create_sem(struct ntsy + return put_user(fd, &user_args->sem); + } + ++static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) ++{ ++ struct file *file = fget(fd); ++ struct ntsync_obj *obj; ++ ++ if (!file) ++ return NULL; ++ ++ if (file->f_op != &ntsync_obj_fops) { ++ fput(file); ++ return NULL; ++ } ++ ++ obj = file->private_data; ++ if (obj->dev != dev) { ++ fput(file); ++ return NULL; ++ } ++ ++ return obj; ++} ++ ++static void put_obj(struct ntsync_obj *obj) ++{ ++ fput(obj->file); ++} ++ ++static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) ++{ ++ ktime_t timeout = ns_to_ktime(args->timeout); ++ clockid_t clock = CLOCK_MONOTONIC; ++ ktime_t *timeout_ptr; ++ int ret = 0; ++ ++ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout); ++ ++ if (args->flags & NTSYNC_WAIT_REALTIME) ++ clock = CLOCK_REALTIME; ++ ++ do { ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (atomic_read(&q->signaled) != -1) { ++ ret = 0; ++ break; ++ } ++ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); ++ } while (ret < 0); ++ __set_current_state(TASK_RUNNING); ++ ++ return ret; ++} ++ ++/* ++ * Allocate and initialize the ntsync_q structure, but do not queue us yet. ++ */ ++static int setup_wait(struct ntsync_device *dev, ++ const struct ntsync_wait_args *args, ++ struct ntsync_q **ret_q) ++{ ++ const __u32 count = args->count; ++ int fds[NTSYNC_MAX_WAIT_COUNT]; ++ struct ntsync_q *q; ++ __u32 i, j; ++ ++ if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ return -EINVAL; ++ ++ if (args->count > NTSYNC_MAX_WAIT_COUNT) ++ return -EINVAL; ++ ++ if (copy_from_user(fds, u64_to_user_ptr(args->objs), ++ array_size(count, sizeof(*fds)))) ++ return -EFAULT; ++ ++ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); ++ if (!q) ++ return -ENOMEM; ++ q->task = current; ++ atomic_set(&q->signaled, -1); ++ q->count = count; ++ ++ for (i = 0; i < count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = get_obj(dev, fds[i]); ++ ++ if (!obj) ++ goto err; ++ ++ entry->obj = obj; ++ entry->q = q; ++ entry->index = i; ++ } ++ ++ *ret_q = q; ++ return 0; ++ ++err: ++ for (j = 0; j < i; j++) ++ put_obj(q->entries[j].obj); ++ kfree(q); ++ return -EINVAL; ++} ++ ++static void try_wake_any_obj(struct ntsync_obj *obj) ++{ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ try_wake_any_sem(obj); ++ break; ++ } ++} ++ ++static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ spin_lock(&obj->lock); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ 
spin_unlock(&obj->lock); ++ } ++ ++ /* check if we are already signaled */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ if (atomic_read(&q->signaled) != -1) ++ break; ++ ++ spin_lock(&obj->lock); ++ try_wake_any_obj(obj); ++ spin_unlock(&obj->lock); ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ spin_lock(&obj->lock); ++ list_del(&entry->node); ++ spin_unlock(&obj->lock); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ + static int ntsync_char_open(struct inode *inode, struct file *file) + { + struct ntsync_device *dev; +@@ -222,6 +465,8 @@ static long ntsync_char_ioctl(struct fil + switch (cmd) { + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ANY: ++ return ntsync_wait_any(dev, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -16,7 +16,21 @@ struct ntsync_sem_args { + __u32 max; + }; + ++#define NTSYNC_WAIT_REALTIME 0x1 ++ ++struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 index; ++ __u32 flags; ++ __u32 pad[3]; ++}; ++ ++#define NTSYNC_MAX_WAIT_COUNT 64 ++ + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch b/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch new file mode 100644 index 0000000..c3cc639 --- /dev/null +++ b/debian/patches/misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch @@ -0,0 +1,532 @@ +From 73fc33606fcb7028ec1ee6027a361de4e85ab5d6 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:28 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ALL. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are +simultaneously signaled, and then acquires all of them as a single atomic +operation. + +Because acquisition of multiple objects is atomic, some complex locking is +required. We cannot simply spin-lock multiple objects simultaneously, as that +may disable preëmption for a problematically long time. + +Instead, modifying any object which may be involved in a wait-all operation takes +a device-wide sleeping mutex, "wait_all_lock", instead of the normal object +spinlock. + +Because wait-for-all is a rare operation, in order to optimize wait-for-any, +this lock is only taken when necessary. "all_hint" is used to mark objects which +are involved in a wait-for-all operation, and if an object is not, only its +spinlock is taken. + +The locking scheme used here was written by Peter Zijlstra. 
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 334 ++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 322 insertions(+), 13 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -41,6 +42,7 @@ enum ntsync_type { + + struct ntsync_obj { + spinlock_t lock; ++ int dev_locked; + + enum ntsync_type type; + +@@ -55,7 +57,30 @@ struct ntsync_obj { + } sem; + } u; + ++ /* ++ * any_waiters is protected by the object lock, but all_waiters is ++ * protected by the device wait_all_lock. ++ */ + struct list_head any_waiters; ++ struct list_head all_waiters; ++ ++ /* ++ * Hint describing how many tasks are queued on this object in a ++ * wait-all operation. ++ * ++ * Any time we do a wake, we may need to wake "all" waiters as well as ++ * "any" waiters. In order to atomically wake "all" waiters, we must ++ * lock all of the objects, and that means grabbing the wait_all_lock ++ * below (and, due to lock ordering rules, before locking this object). ++ * However, wait-all is a rare operation, and grabbing the wait-all ++ * lock for every wake would create unnecessary contention. ++ * Therefore we first check whether all_hint is zero, and, if it is, ++ * we skip trying to wake "all" waiters. ++ * ++ * Since wait requests must originate from user-space threads, we're ++ * limited here by PID_MAX_LIMIT, so there's no risk of overflow. ++ */ ++ atomic_t all_hint; + }; + + struct ntsync_q_entry { +@@ -75,19 +100,198 @@ struct ntsync_q { + */ + atomic_t signaled; + ++ bool all; + __u32 count; + struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { ++ /* ++ * Wait-all operations must atomically grab all objects, and be totally ++ * ordered with respect to each other and wait-any operations. ++ * If one thread is trying to acquire several objects, another thread ++ * cannot touch the object at the same time. ++ * ++ * This device-wide lock is used to serialize wait-for-all ++ * operations, and operations on an object that is involved in a ++ * wait-for-all. ++ */ ++ struct mutex wait_all_lock; ++ + struct file *file; + }; + ++/* ++ * Single objects are locked using obj->lock. ++ * ++ * Multiple objects are 'locked' while holding dev->wait_all_lock. ++ * In this case however, individual objects are not locked by holding ++ * obj->lock, but by setting obj->dev_locked. ++ * ++ * This means that in order to lock a single object, the sequence is slightly ++ * more complicated than usual. Specifically it needs to check obj->dev_locked ++ * after acquiring obj->lock, if set, it needs to drop the lock and acquire ++ * dev->wait_all_lock in order to serialize against the multi-object operation. ++ */ ++ ++static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ /* ++ * By setting obj->dev_locked inside obj->lock, it is ensured that ++ * anyone holding obj->lock must see the value. 
++ */ ++ obj->dev_locked = 1; ++ spin_unlock(&obj->lock); ++} ++ ++static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ obj->dev_locked = 0; ++ spin_unlock(&obj->lock); ++} ++ ++static void obj_lock(struct ntsync_obj *obj) ++{ ++ struct ntsync_device *dev = obj->dev; ++ ++ for (;;) { ++ spin_lock(&obj->lock); ++ if (likely(!obj->dev_locked)) ++ break; ++ ++ spin_unlock(&obj->lock); ++ mutex_lock(&dev->wait_all_lock); ++ spin_lock(&obj->lock); ++ /* ++ * obj->dev_locked should be set and released under the same ++ * wait_all_lock section, since we now own this lock, it should ++ * be clear. ++ */ ++ lockdep_assert(!obj->dev_locked); ++ spin_unlock(&obj->lock); ++ mutex_unlock(&dev->wait_all_lock); ++ } ++} ++ ++static void obj_unlock(struct ntsync_obj *obj) ++{ ++ spin_unlock(&obj->lock); ++} ++ ++static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ bool all; ++ ++ obj_lock(obj); ++ all = atomic_read(&obj->all_hint); ++ if (unlikely(all)) { ++ obj_unlock(obj); ++ mutex_lock(&dev->wait_all_lock); ++ dev_lock_obj(dev, obj); ++ } ++ ++ return all; ++} ++ ++static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) ++{ ++ if (all) { ++ dev_unlock_obj(dev, obj); ++ mutex_unlock(&dev->wait_all_lock); ++ } else { ++ obj_unlock(obj); ++ } ++} ++ ++#define ntsync_assert_held(obj) \ ++ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ ++ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ ++ (obj)->dev_locked)) ++ ++static bool is_signaled(struct ntsync_obj *obj) ++{ ++ ntsync_assert_held(obj); ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ return !!obj->u.sem.count; ++ } ++ ++ WARN(1, "bad object type %#x\n", obj->type); ++ return false; ++} ++ ++/* ++ * "locked_obj" is an optional pointer to an object which is already locked and ++ * should not be locked again. This is necessary so that changing an object's ++ * state and waking it can be a single atomic operation. 
++ */ ++static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, ++ struct ntsync_obj *locked_obj) ++{ ++ __u32 count = q->count; ++ bool can_wake = true; ++ int signaled = -1; ++ __u32 i; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ if (locked_obj) ++ lockdep_assert(locked_obj->dev_locked); ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_lock_obj(dev, q->entries[i].obj); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (!is_signaled(q->entries[i].obj)) { ++ can_wake = false; ++ break; ++ } ++ } ++ ++ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { ++ for (i = 0; i < count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ obj->u.sem.count--; ++ break; ++ } ++ } ++ wake_up_process(q->task); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_unlock_obj(dev, q->entries[i].obj); ++ } ++} ++ ++static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev_locked); ++ ++ list_for_each_entry(entry, &obj->all_waiters, node) ++ try_wake_all(dev, entry->q, obj); ++} ++ + static void try_wake_any_sem(struct ntsync_obj *sem) + { + struct ntsync_q_entry *entry; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); ++ lockdep_assert(sem->type == NTSYNC_TYPE_SEM); + + list_for_each_entry(entry, &sem->any_waiters, node) { + struct ntsync_q *q = entry->q; +@@ -111,7 +315,7 @@ static int post_sem_state(struct ntsync_ + { + __u32 sum; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); + + if (check_add_overflow(sem->u.sem.count, count, &sum) || + sum > sem->u.sem.max) +@@ -123,9 +327,11 @@ static int post_sem_state(struct ntsync_ + + static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + { ++ struct ntsync_device *dev = sem->dev; + __u32 __user *user_args = argp; + __u32 prev_count; + __u32 args; ++ bool all; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -134,14 +340,17 @@ static int ntsync_sem_post(struct ntsync + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + +- spin_lock(&sem->lock); ++ all = ntsync_lock_obj(dev, sem); + + prev_count = sem->u.sem.count; + ret = post_sem_state(sem, args); +- if (!ret) ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, sem); + try_wake_any_sem(sem); ++ } + +- spin_unlock(&sem->lock); ++ ntsync_unlock_obj(dev, sem, all); + + if (!ret && put_user(prev_count, user_args)) + ret = -EFAULT; +@@ -194,6 +403,8 @@ static struct ntsync_obj *ntsync_alloc_o + get_file(dev->file); + spin_lock_init(&obj->lock); + INIT_LIST_HEAD(&obj->any_waiters); ++ INIT_LIST_HEAD(&obj->all_waiters); ++ atomic_set(&obj->all_hint, 0); + + return obj; + } +@@ -305,7 +516,7 @@ static int ntsync_schedule(const struct + * Allocate and initialize the ntsync_q structure, but do not queue us yet. 
+ */ + static int setup_wait(struct ntsync_device *dev, +- const struct ntsync_wait_args *args, ++ const struct ntsync_wait_args *args, bool all, + struct ntsync_q **ret_q) + { + const __u32 count = args->count; +@@ -328,6 +539,7 @@ static int setup_wait(struct ntsync_devi + return -ENOMEM; + q->task = current; + atomic_set(&q->signaled, -1); ++ q->all = all; + q->count = count; + + for (i = 0; i < count; i++) { +@@ -337,6 +549,16 @@ static int setup_wait(struct ntsync_devi + if (!obj) + goto err; + ++ if (all) { ++ /* Check that the objects are all distinct. */ ++ for (j = 0; j < i; j++) { ++ if (obj == q->entries[j].obj) { ++ put_obj(obj); ++ goto err; ++ } ++ } ++ } ++ + entry->obj = obj; + entry->q = q; + entry->index = i; +@@ -366,13 +588,14 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_wait_args args; + struct ntsync_q *q; + int signaled; ++ bool all; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + +- ret = setup_wait(dev, &args, &q); ++ ret = setup_wait(dev, &args, false, &q); + if (ret < 0) + return ret; + +@@ -382,9 +605,9 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + list_add_tail(&entry->node, &obj->any_waiters); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); + } + + /* check if we are already signaled */ +@@ -395,9 +618,9 @@ static int ntsync_wait_any(struct ntsync + if (atomic_read(&q->signaled) != -1) + break; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + try_wake_any_obj(obj); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); + } + + /* sleep */ +@@ -410,13 +633,94 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +- spin_lock(&obj->lock); ++ all = ntsync_lock_obj(dev, obj); + list_del(&entry->node); +- spin_unlock(&obj->lock); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ ++static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, true, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ atomic_inc(&obj->all_hint); ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire obj->lock ++ * here. 
++ */ ++ list_add_tail(&entry->node, &obj->all_waiters); ++ } ++ ++ /* check if we are already signaled */ ++ ++ try_wake_all(dev, q, NULL); ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire it here. ++ */ ++ list_del(&entry->node); ++ ++ atomic_dec(&obj->all_hint); + + put_obj(obj); + } + ++ mutex_unlock(&dev->wait_all_lock); ++ + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; +@@ -442,6 +746,8 @@ static int ntsync_char_open(struct inode + if (!dev) + return -ENOMEM; + ++ mutex_init(&dev->wait_all_lock); ++ + file->private_data = dev; + dev->file = file; + return nonseekable_open(inode, file); +@@ -465,6 +771,8 @@ static long ntsync_char_ioctl(struct fil + switch (cmd) { + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ALL: ++ return ntsync_wait_all(dev, argp); + case NTSYNC_IOC_WAIT_ANY: + return ntsync_wait_any(dev, argp); + default: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -31,6 +31,7 @@ struct ntsync_wait_args { + + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) ++#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch b/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch new file mode 100644 index 0000000..377e09d --- /dev/null +++ b/debian/patches/misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch @@ -0,0 +1,226 @@ +From fdeceab49078a80987c665ed837ee4f1b8a942a8 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:29 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX. + +This corresponds to the NT syscall NtCreateMutant(). + +An NT mutex is recursive, with a 32-bit recursion counter. When acquired via +NtWaitForMultipleObjects(), the recursion counter is incremented by one. The OS +records the thread which acquired it. + +The OS records the thread which acquired it. However, in order to keep this +driver self-contained, the owning thread ID is managed by user-space, and passed +as a parameter to all relevant ioctls. + +The initial owner and recursion count, if any, are specified when the mutex is +created. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 77 +++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 10 ++++- + 2 files changed, 83 insertions(+), 4 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -25,6 +25,7 @@ + + enum ntsync_type { + NTSYNC_TYPE_SEM, ++ NTSYNC_TYPE_MUTEX, + }; + + /* +@@ -55,6 +56,10 @@ struct ntsync_obj { + __u32 count; + __u32 max; + } sem; ++ struct { ++ __u32 count; ++ pid_t owner; ++ } mutex; + } u; + + /* +@@ -92,6 +97,7 @@ struct ntsync_q_entry { + + struct ntsync_q { + struct task_struct *task; ++ __u32 owner; + + /* + * Protected via atomic_try_cmpxchg(). 
Only the thread that wins the +@@ -214,13 +220,17 @@ static void ntsync_unlock_obj(struct nts + ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ + (obj)->dev_locked)) + +-static bool is_signaled(struct ntsync_obj *obj) ++static bool is_signaled(struct ntsync_obj *obj, __u32 owner) + { + ntsync_assert_held(obj); + + switch (obj->type) { + case NTSYNC_TYPE_SEM: + return !!obj->u.sem.count; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.owner && obj->u.mutex.owner != owner) ++ return false; ++ return obj->u.mutex.count < UINT_MAX; + } + + WARN(1, "bad object type %#x\n", obj->type); +@@ -250,7 +260,7 @@ static void try_wake_all(struct ntsync_d + } + + for (i = 0; i < count; i++) { +- if (!is_signaled(q->entries[i].obj)) { ++ if (!is_signaled(q->entries[i].obj, q->owner)) { + can_wake = false; + break; + } +@@ -264,6 +274,10 @@ static void try_wake_all(struct ntsync_d + case NTSYNC_TYPE_SEM: + obj->u.sem.count--; + break; ++ case NTSYNC_TYPE_MUTEX: ++ obj->u.mutex.count++; ++ obj->u.mutex.owner = q->owner; ++ break; + } + } + wake_up_process(q->task); +@@ -307,6 +321,30 @@ static void try_wake_any_sem(struct ntsy + } + } + ++static void try_wake_any_mutex(struct ntsync_obj *mutex) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(mutex); ++ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); ++ ++ list_for_each_entry(entry, &mutex->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (mutex->u.mutex.count == UINT_MAX) ++ break; ++ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) ++ continue; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ mutex->u.mutex.count++; ++ mutex->u.mutex.owner = q->owner; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -455,6 +493,33 @@ static int ntsync_create_sem(struct ntsy + return put_user(fd, &user_args->sem); + } + ++static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_mutex_args args; ++ struct ntsync_obj *mutex; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ if (!args.owner != !args.count) ++ return -EINVAL; ++ ++ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); ++ if (!mutex) ++ return -ENOMEM; ++ mutex->u.mutex.count = args.count; ++ mutex->u.mutex.owner = args.owner; ++ fd = ntsync_obj_get_fd(mutex); ++ if (fd < 0) { ++ kfree(mutex); ++ return fd; ++ } ++ ++ return put_user(fd, &user_args->mutex); ++} ++ + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) + { + struct file *file = fget(fd); +@@ -524,7 +589,7 @@ static int setup_wait(struct ntsync_devi + struct ntsync_q *q; + __u32 i, j; + +- if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) + return -EINVAL; + + if (args->count > NTSYNC_MAX_WAIT_COUNT) +@@ -538,6 +603,7 @@ static int setup_wait(struct ntsync_devi + if (!q) + return -ENOMEM; + q->task = current; ++ q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->all = all; + q->count = count; +@@ -580,6 +646,9 @@ static void try_wake_any_obj(struct ntsy + case NTSYNC_TYPE_SEM: + try_wake_any_sem(obj); + break; ++ case NTSYNC_TYPE_MUTEX: ++ try_wake_any_mutex(obj); ++ break; + } + } + +@@ -769,6 +838,8 @@ static long ntsync_char_ioctl(struct fil + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_MUTEX: ++ return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); + case NTSYNC_IOC_WAIT_ALL: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -16,6 +16,12 @@ struct ntsync_sem_args { + __u32 max; + }; + ++struct ntsync_mutex_args { ++ __u32 mutex; ++ __u32 owner; ++ __u32 count; ++}; ++ + #define NTSYNC_WAIT_REALTIME 0x1 + + struct ntsync_wait_args { +@@ -24,7 +30,8 @@ struct ntsync_wait_args { + __u32 count; + __u32 index; + __u32 flags; +- __u32 pad[3]; ++ __u32 owner; ++ __u32 pad[2]; + }; + + #define NTSYNC_MAX_WAIT_COUNT 64 +@@ -32,6 +39,7 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + diff --git a/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch b/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch new file mode 100644 index 0000000..988aa19 --- /dev/null +++ b/debian/patches/misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch @@ -0,0 +1,95 @@ +From cc9ade623cd90cd002fb86f3aa249af2e6e4019e Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:30 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK. + +This corresponds to the NT syscall NtReleaseMutant(). + +This syscall decrements the mutex's recursion count by one, and returns the +previous value. If the mutex is not owned by the current task, the function +instead fails and returns -EPERM. 
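A hedged sketch of the corresponding user-space call (illustration only; mutex_fd and the
owner identifier are assumed to be tracked by the NT emulator):

    static int release_mutex(int mutex_fd, __u32 owner)
    {
            struct ntsync_mutex_args args = { .owner = owner };

            /* fails with EPERM if "owner" does not own the mutex, EINVAL if owner is zero */
            if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_UNLOCK, &args) < 0)
                    return -1;
            return args.count;  /* previous recursion count, filled in by the driver */
    }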
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 53 +++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 54 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -396,6 +396,57 @@ static int ntsync_sem_post(struct ntsync + return ret; + } + ++/* ++ * Actually change the mutex state, returning -EPERM if not the owner. ++ */ ++static int unlock_mutex_state(struct ntsync_obj *mutex, ++ const struct ntsync_mutex_args *args) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != args->owner) ++ return -EPERM; ++ ++ if (!--mutex->u.mutex.count) ++ mutex->u.mutex.owner = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ __u32 prev_count; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ if (!args.owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ prev_count = mutex->u.mutex.count; ++ ret = unlock_mutex_state(mutex, &args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (!ret && put_user(prev_count, &user_args->count)) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -415,6 +466,8 @@ static long ntsync_obj_ioctl(struct file + switch (cmd) { + case NTSYNC_IOC_SEM_POST: + return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_MUTEX_UNLOCK: ++ return ntsync_mutex_unlock(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -42,5 +42,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch b/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch new file mode 100644 index 0000000..8eb42fe --- /dev/null +++ b/debian/patches/misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch @@ -0,0 +1,156 @@ +From dca3fe766afa42e34f5d3f62c0f2850760663176 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:31 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_KILL. + +This does not correspond to any NT syscall. Rather, when a thread dies, it +should be called by the NT emulator for each mutex, with the TID of the dying +thread. + +NT mutexes are robust (in the pthread sense). When an NT thread dies, any +mutexes it owned are immediately released. Acquisition of those mutexes by other +threads will return a special value indicating that the mutex was abandoned, +like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed +used here for that purpose. 
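A sketch of how an NT emulator might use this when a thread exits (illustration only; the
thread id bookkeeping is assumed to live entirely in user space):

    __u32 tid = exiting_thread_id;   /* hypothetical: whatever id was passed as "owner" */

    if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_KILL, &tid) == 0) {
            /* the mutex is now abandoned: unowned, count zero, and the next
             * successful waiter will see EOWNERDEAD from the wait ioctl */
    }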
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 61 +++++++++++++++++++++++++++++++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 60 insertions(+), 2 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -59,6 +59,7 @@ struct ntsync_obj { + struct { + __u32 count; + pid_t owner; ++ bool ownerdead; + } mutex; + } u; + +@@ -107,6 +108,7 @@ struct ntsync_q { + atomic_t signaled; + + bool all; ++ bool ownerdead; + __u32 count; + struct ntsync_q_entry entries[]; + }; +@@ -275,6 +277,9 @@ static void try_wake_all(struct ntsync_d + obj->u.sem.count--; + break; + case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.ownerdead) ++ q->ownerdead = true; ++ obj->u.mutex.ownerdead = false; + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; +@@ -338,6 +343,9 @@ static void try_wake_any_mutex(struct nt + continue; + + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (mutex->u.mutex.ownerdead) ++ q->ownerdead = true; ++ mutex->u.mutex.ownerdead = false; + mutex->u.mutex.count++; + mutex->u.mutex.owner = q->owner; + wake_up_process(q->task); +@@ -447,6 +455,52 @@ static int ntsync_mutex_unlock(struct nt + return ret; + } + ++/* ++ * Actually change the mutex state to mark its owner as dead, ++ * returning -EPERM if not the owner. ++ */ ++static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != owner) ++ return -EPERM; ++ ++ mutex->u.mutex.ownerdead = true; ++ mutex->u.mutex.owner = 0; ++ mutex->u.mutex.count = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_device *dev = mutex->dev; ++ __u32 owner; ++ bool all; ++ int ret; ++ ++ if (get_user(owner, (__u32 __user *)argp)) ++ return -EFAULT; ++ if (!owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ ret = kill_mutex_state(mutex, owner); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -468,6 +522,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_sem_post(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: + return ntsync_mutex_unlock(obj, argp); ++ case NTSYNC_IOC_MUTEX_KILL: ++ return ntsync_mutex_kill(obj, argp); + default: + return -ENOIOCTLCMD; + } +@@ -659,6 +715,7 @@ static int setup_wait(struct ntsync_devi + q->owner = args->owner; + atomic_set(&q->signaled, -1); + q->all = all; ++ q->ownerdead = false; + q->count = count; + + for (i = 0; i < count; i++) { +@@ -767,7 +824,7 @@ static int ntsync_wait_any(struct ntsync + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ +- ret = 0; ++ ret = q->ownerdead ? -EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; +@@ -848,7 +905,7 @@ static int ntsync_wait_all(struct ntsync + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ +- ret = 0; ++ ret = q->ownerdead ? 
-EOWNERDEAD : 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -43,5 +43,6 @@ struct ntsync_wait_args { + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) ++#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch b/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch new file mode 100644 index 0000000..5cbb5f2 --- /dev/null +++ b/debian/patches/misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch @@ -0,0 +1,166 @@ +From 3f3bbc85f1e613364261d685b8197c32ffdeaad0 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:32 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_CREATE_EVENT. + +This correspond to the NT syscall NtCreateEvent(). + +An NT event holds a single bit of state denoting whether it is signaled or +unsignaled. + +There are two types of events: manual-reset and automatic-reset. When an +automatic-reset event is acquired via a wait function, its state is reset to +unsignaled. Manual-reset events are not affected by wait functions. + +Whether the event is manual-reset, and its initial state, are specified at +creation time. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 62 +++++++++++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 7 +++++ + 2 files changed, 69 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -26,6 +26,7 @@ + enum ntsync_type { + NTSYNC_TYPE_SEM, + NTSYNC_TYPE_MUTEX, ++ NTSYNC_TYPE_EVENT, + }; + + /* +@@ -61,6 +62,10 @@ struct ntsync_obj { + pid_t owner; + bool ownerdead; + } mutex; ++ struct { ++ bool manual; ++ bool signaled; ++ } event; + } u; + + /* +@@ -233,6 +238,8 @@ static bool is_signaled(struct ntsync_ob + if (obj->u.mutex.owner && obj->u.mutex.owner != owner) + return false; + return obj->u.mutex.count < UINT_MAX; ++ case NTSYNC_TYPE_EVENT: ++ return obj->u.event.signaled; + } + + WARN(1, "bad object type %#x\n", obj->type); +@@ -283,6 +290,10 @@ static void try_wake_all(struct ntsync_d + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; ++ case NTSYNC_TYPE_EVENT: ++ if (!obj->u.event.manual) ++ obj->u.event.signaled = false; ++ break; + } + } + wake_up_process(q->task); +@@ -353,6 +364,28 @@ static void try_wake_any_mutex(struct nt + } + } + ++static void try_wake_any_event(struct ntsync_obj *event) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(event); ++ lockdep_assert(event->type == NTSYNC_TYPE_EVENT); ++ ++ list_for_each_entry(entry, &event->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!event->u.event.signaled) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (!event->u.event.manual) ++ event->u.event.signaled = false; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. 
+@@ -629,6 +662,30 @@ static int ntsync_create_mutex(struct nt + return put_user(fd, &user_args->mutex); + } + ++static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_event_args args; ++ struct ntsync_obj *event; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); ++ if (!event) ++ return -ENOMEM; ++ event->u.event.manual = args.manual; ++ event->u.event.signaled = args.signaled; ++ fd = ntsync_obj_get_fd(event); ++ if (fd < 0) { ++ kfree(event); ++ return fd; ++ } ++ ++ return put_user(fd, &user_args->event); ++} ++ + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) + { + struct file *file = fget(fd); +@@ -759,6 +816,9 @@ static void try_wake_any_obj(struct ntsy + case NTSYNC_TYPE_MUTEX: + try_wake_any_mutex(obj); + break; ++ case NTSYNC_TYPE_EVENT: ++ try_wake_any_event(obj); ++ break; + } + } + +@@ -948,6 +1008,8 @@ static long ntsync_char_ioctl(struct fil + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_EVENT: ++ return ntsync_create_event(dev, argp); + case NTSYNC_IOC_CREATE_MUTEX: + return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -22,6 +22,12 @@ struct ntsync_mutex_args { + __u32 count; + }; + ++struct ntsync_event_args { ++ __u32 event; ++ __u32 manual; ++ __u32 signaled; ++}; ++ + #define NTSYNC_WAIT_REALTIME 0x1 + + struct ntsync_wait_args { +@@ -40,6 +46,7 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) + #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) + #define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOWR('N', 0x87, struct ntsync_event_args) + + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) diff --git a/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch b/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch new file mode 100644 index 0000000..af5f192 --- /dev/null +++ b/debian/patches/misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch @@ -0,0 +1,67 @@ +From a6f107f17a976008b85c3e269bf4196e595d3f52 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:33 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_SET. + +This corresponds to the NT syscall NtSetEvent(). + +This sets the event to the signaled state, and returns its previous state. 
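A minimal usage sketch (illustration only; event_fd is assumed to come from
NTSYNC_IOC_CREATE_EVENT):

    __u32 prev = 0;

    if (ioctl(event_fd, NTSYNC_IOC_EVENT_SET, &prev) == 0 && prev) {
            /* the event was already signaled before this call */
    }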
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 27 +++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 28 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -534,6 +534,31 @@ static int ntsync_mutex_kill(struct ntsy + return ret; + } + ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = true; ++ if (all) ++ try_wake_all_obj(dev, event); ++ try_wake_any_event(event); ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -557,6 +582,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_EVENT_SET: ++ return ntsync_event_set(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -51,5 +51,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) ++#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch b/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch new file mode 100644 index 0000000..1500576 --- /dev/null +++ b/debian/patches/misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch @@ -0,0 +1,64 @@ +From aa3ebb5870eb9ed259aba2ed4e07e9993e6cd978 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:34 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_RESET. + +This corresponds to the NT syscall NtResetEvent(). + +This sets the event to the unsignaled state, and returns its previous state. 
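Because the previous state is returned and the state change happens under the object lock,
the ioctl can double as an atomic test-and-clear; a sketch (event_fd assumed as above,
illustration only):

    __u32 was_signaled = 0;

    if (ioctl(event_fd, NTSYNC_IOC_EVENT_RESET, &was_signaled) == 0 && was_signaled) {
            /* we consumed the signaled state; a reset never wakes waiters */
    }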
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 25 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -559,6 +559,28 @@ static int ntsync_event_set(struct ntsyn + return 0; + } + ++static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -584,6 +606,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_EVENT_SET: + return ntsync_event_set(obj, argp); ++ case NTSYNC_IOC_EVENT_RESET: ++ return ntsync_event_reset(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -52,5 +52,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) ++#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch b/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch new file mode 100644 index 0000000..6f14ed4 --- /dev/null +++ b/debian/patches/misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch @@ -0,0 +1,60 @@ +From 99bca5d776a3011214041c42107a210fe315a35e Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:35 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_PULSE. + +This corresponds to the NT syscall NtPulseEvent(). + +This wakes up any waiters as if the event had been set, but does not set the +event, instead resetting it if it had been signalled. Thus, for a manual-reset +event, all waiters are woken, whereas for an auto-reset event, at most one +waiter is woken. 
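Sketch of a pulse (event_fd assumed as above; illustration only):

    __u32 prev = 0;

    /* wakes eligible waiters as if the event were set, then leaves it unsignaled */
    ioctl(event_fd, NTSYNC_IOC_EVENT_PULSE, &prev);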
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 8 ++++++-- + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -534,7 +534,7 @@ static int ntsync_mutex_kill(struct ntsy + return ret; + } + +-static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) + { + struct ntsync_device *dev = event->dev; + __u32 prev_state; +@@ -550,6 +550,8 @@ static int ntsync_event_set(struct ntsyn + if (all) + try_wake_all_obj(dev, event); + try_wake_any_event(event); ++ if (pulse) ++ event->u.event.signaled = false; + + ntsync_unlock_obj(dev, event, all); + +@@ -605,9 +607,11 @@ static long ntsync_obj_ioctl(struct file + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_EVENT_SET: +- return ntsync_event_set(obj, argp); ++ return ntsync_event_set(obj, argp, false); + case NTSYNC_IOC_EVENT_RESET: + return ntsync_event_reset(obj, argp); ++ case NTSYNC_IOC_EVENT_PULSE: ++ return ntsync_event_set(obj, argp, true); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -53,5 +53,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) ++#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + + #endif diff --git a/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch b/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch new file mode 100644 index 0000000..19f8345 --- /dev/null +++ b/debian/patches/misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch @@ -0,0 +1,66 @@ +From 1ef0ea672662bd19e7c6a4eac1067d11e50844b2 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:36 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_SEM_READ. + +This corresponds to the NT syscall NtQuerySemaphore(). + +This returns the current count and maximum count of the semaphore. 
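The read is non-destructive; a minimal sketch (sem_fd assumed, illustration only):

    struct ntsync_sem_args state;

    if (ioctl(sem_fd, NTSYNC_IOC_SEM_READ, &state) == 0)
            printf("count %u of max %u\n", state.count, state.max);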
+ +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 27 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -583,6 +583,30 @@ static int ntsync_event_reset(struct nts + return 0; + } + ++static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) ++{ ++ struct ntsync_sem_args __user *user_args = argp; ++ struct ntsync_device *dev = sem->dev; ++ struct ntsync_sem_args args; ++ bool all; ++ ++ if (sem->type != NTSYNC_TYPE_SEM) ++ return -EINVAL; ++ ++ args.sem = 0; ++ ++ all = ntsync_lock_obj(dev, sem); ++ ++ args.count = sem->u.sem.count; ++ args.max = sem->u.sem.max; ++ ++ ntsync_unlock_obj(dev, sem, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -602,6 +626,8 @@ static long ntsync_obj_ioctl(struct file + switch (cmd) { + case NTSYNC_IOC_SEM_POST: + return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_READ: ++ return ntsync_sem_read(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -54,5 +54,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) ++#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch b/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch new file mode 100644 index 0000000..d75f8c3 --- /dev/null +++ b/debian/patches/misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch @@ -0,0 +1,68 @@ +From 7891b7d15abd12975aebb955821fbc43353b45d6 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:37 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_READ. + +This corresponds to the NT syscall NtQueryMutant(). + +This returns the recursion count, owner, and abandoned state of the mutex. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 28 ++++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 29 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -607,6 +607,32 @@ static int ntsync_sem_read(struct ntsync + return 0; + } + ++static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ bool all; ++ int ret; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ args.mutex = 0; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ args.count = mutex->u.mutex.count; ++ args.owner = mutex->u.mutex.owner; ++ ret = mutex->u.mutex.ownerdead ? 
-EOWNERDEAD : 0; ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return ret; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -632,6 +658,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_MUTEX_READ: ++ return ntsync_mutex_read(obj, argp); + case NTSYNC_IOC_EVENT_SET: + return ntsync_event_set(obj, argp, false); + case NTSYNC_IOC_EVENT_RESET: +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -55,5 +55,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) ++#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch b/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch new file mode 100644 index 0000000..a72cd1b --- /dev/null +++ b/debian/patches/misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch @@ -0,0 +1,66 @@ +From 35ff252f99aa4002e0c2ecef37314a422969791b Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:38 -0500 +Subject: ntsync: Introduce NTSYNC_IOC_EVENT_READ. + +This corresponds to the NT syscall NtQueryEvent(). + +This returns the signaled state of the event and whether it is manual-reset. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ + include/uapi/linux/ntsync.h | 1 + + 2 files changed, 27 insertions(+) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -633,6 +633,30 @@ static int ntsync_mutex_read(struct ntsy + return ret; + } + ++static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_device *dev = event->dev; ++ struct ntsync_event_args args; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ args.event = 0; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ args.manual = event->u.event.manual; ++ args.signaled = event->u.event.signaled; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ + static int ntsync_obj_release(struct inode *inode, struct file *file) + { + struct ntsync_obj *obj = file->private_data; +@@ -666,6 +690,8 @@ static long ntsync_obj_ioctl(struct file + return ntsync_event_reset(obj, argp); + case NTSYNC_IOC_EVENT_PULSE: + return ntsync_event_set(obj, argp, true); ++ case NTSYNC_IOC_EVENT_READ: ++ return ntsync_event_read(obj, argp); + default: + return -ENOIOCTLCMD; + } +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -56,5 +56,6 @@ struct ntsync_wait_args { + #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) + #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) + #define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) ++#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) + + #endif diff --git a/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch b/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch new file mode 100644 index 
0000000..88580a0 --- /dev/null +++ b/debian/patches/misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch @@ -0,0 +1,187 @@ +From 2c391d57d1393cd46bf8bab08232ddc3dd32d5e5 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:39 -0500 +Subject: ntsync: Introduce alertable waits. + +NT waits can optionally be made "alertable". This is a special channel for +thread wakeup that is mildly similar to SIGIO. A thread has an internal single +bit of "alerted" state, and if a thread is alerted while an alertable wait, the +wait will return a special value, consume the "alerted" state, and will not +consume any of its objects. + +Alerts are implemented using events; the user-space NT emulator is expected to +create an internal ntsync event for each thread and pass that event to wait +functions. + +Signed-off-by: Elizabeth Figura +--- + drivers/misc/ntsync.c | 70 ++++++++++++++++++++++++++++++++----- + include/uapi/linux/ntsync.h | 3 +- + 2 files changed, 63 insertions(+), 10 deletions(-) + +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -885,22 +885,29 @@ static int setup_wait(struct ntsync_devi + const struct ntsync_wait_args *args, bool all, + struct ntsync_q **ret_q) + { ++ int fds[NTSYNC_MAX_WAIT_COUNT + 1]; + const __u32 count = args->count; +- int fds[NTSYNC_MAX_WAIT_COUNT]; + struct ntsync_q *q; ++ __u32 total_count; + __u32 i, j; + +- if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) + return -EINVAL; + + if (args->count > NTSYNC_MAX_WAIT_COUNT) + return -EINVAL; + ++ total_count = count; ++ if (args->alert) ++ total_count++; ++ + if (copy_from_user(fds, u64_to_user_ptr(args->objs), + array_size(count, sizeof(*fds)))) + return -EFAULT; ++ if (args->alert) ++ fds[count] = args->alert; + +- q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); ++ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); + if (!q) + return -ENOMEM; + q->task = current; +@@ -910,7 +917,7 @@ static int setup_wait(struct ntsync_devi + q->ownerdead = false; + q->count = count; + +- for (i = 0; i < count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = get_obj(dev, fds[i]); + +@@ -960,10 +967,10 @@ static void try_wake_any_obj(struct ntsy + static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) + { + struct ntsync_wait_args args; ++ __u32 i, total_count; + struct ntsync_q *q; + int signaled; + bool all; +- __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -973,9 +980,13 @@ static int ntsync_wait_any(struct ntsync + if (ret < 0) + return ret; + ++ total_count = args.count; ++ if (args.alert) ++ total_count++; ++ + /* queue ourselves */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +@@ -984,9 +995,15 @@ static int ntsync_wait_any(struct ntsync + ntsync_unlock_obj(dev, obj, all); + } + +- /* check if we are already signaled */ ++ /* ++ * Check if we are already signaled. ++ * ++ * Note that the API requires that normal objects are checked before ++ * the alert event. Hence we queue the alert event last, and check ++ * objects in order. 
++ */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_obj *obj = q->entries[i].obj; + + if (atomic_read(&q->signaled) != -1) +@@ -1003,7 +1020,7 @@ static int ntsync_wait_any(struct ntsync + + /* and finally, unqueue */ + +- for (i = 0; i < args.count; i++) { ++ for (i = 0; i < total_count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + +@@ -1063,6 +1080,14 @@ static int ntsync_wait_all(struct ntsync + */ + list_add_tail(&entry->node, &obj->all_waiters); + } ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ dev_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ dev_unlock_obj(dev, obj); ++ } + + /* check if we are already signaled */ + +@@ -1070,6 +1095,21 @@ static int ntsync_wait_all(struct ntsync + + mutex_unlock(&dev->wait_all_lock); + ++ /* ++ * Check if the alert event is signaled, making sure to do so only ++ * after checking if the other objects are signaled. ++ */ ++ ++ if (args.alert) { ++ struct ntsync_obj *obj = q->entries[args.count].obj; ++ ++ if (atomic_read(&q->signaled) == -1) { ++ bool all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ } ++ + /* sleep */ + + ret = ntsync_schedule(q, &args); +@@ -1095,6 +1135,18 @@ static int ntsync_wait_all(struct ntsync + + mutex_unlock(&dev->wait_all_lock); + ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ bool all; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -37,7 +37,8 @@ struct ntsync_wait_args { + __u32 index; + __u32 flags; + __u32 owner; +- __u32 pad[2]; ++ __u32 alert; ++ __u32 pad; + }; + + #define NTSYNC_MAX_WAIT_COUNT 64 diff --git a/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch b/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch new file mode 100644 index 0000000..8baae5c --- /dev/null +++ b/debian/patches/misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch @@ -0,0 +1,30 @@ +From 8d87043cd76368bb9996ba541d12e40cbb4201e5 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:52 -0500 +Subject: maintainers: Add an entry for ntsync. + +Add myself as maintainer, supported by CodeWeavers. 
+ +Signed-off-by: Elizabeth Figura +--- + MAINTAINERS | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Softwa + F: Documentation/filesystems/ntfs3.rst + F: fs/ntfs3/ + ++NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER ++M: Elizabeth Figura ++L: wine-devel@winehq.org ++S: Supported ++F: Documentation/userspace-api/ntsync.rst ++F: drivers/misc/ntsync.c ++F: include/uapi/linux/ntsync.h ++F: tools/testing/selftests/drivers/ntsync/ ++ + NUBUS SUBSYSTEM + M: Finn Thain + L: linux-m68k@lists.linux-m68k.org diff --git a/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch b/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch new file mode 100644 index 0000000..a5f7b7e --- /dev/null +++ b/debian/patches/misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch @@ -0,0 +1,426 @@ +From 4cb25d42d38f1e0b144b084674591b70afa60bb0 Mon Sep 17 00:00:00 2001 +From: Elizabeth Figura +Date: Sun, 19 May 2024 15:24:53 -0500 +Subject: docs: ntsync: Add documentation for the ntsync uAPI. + +Add an overall explanation of the driver architecture, and complete and precise +specification for its intended behaviour. + +Signed-off-by: Elizabeth Figura +--- + Documentation/userspace-api/index.rst | 1 + + Documentation/userspace-api/ntsync.rst | 398 +++++++++++++++++++++++++ + 2 files changed, 399 insertions(+) + create mode 100644 Documentation/userspace-api/ntsync.rst + +--- a/Documentation/userspace-api/index.rst ++++ b/Documentation/userspace-api/index.rst +@@ -63,6 +63,7 @@ Everything else + vduse + futex2 + perf_ring_buffer ++ ntsync + + .. only:: subproject and html + +--- /dev/null ++++ b/Documentation/userspace-api/ntsync.rst +@@ -0,0 +1,398 @@ ++=================================== ++NT synchronization primitive driver ++=================================== ++ ++This page documents the user-space API for the ntsync driver. ++ ++ntsync is a support driver for emulation of NT synchronization ++primitives by user-space NT emulators. It exists because implementation ++in user-space, using existing tools, cannot match Windows performance ++while offering accurate semantics. It is implemented entirely in ++software, and does not drive any hardware device. ++ ++This interface is meant as a compatibility tool only, and should not ++be used for general synchronization. Instead use generic, versatile ++interfaces such as futex(2) and poll(2). ++ ++Synchronization primitives ++========================== ++ ++The ntsync driver exposes three types of synchronization primitives: ++semaphores, mutexes, and events. ++ ++A semaphore holds a single volatile 32-bit counter, and a static 32-bit ++integer denoting the maximum value. It is considered signaled (that is, ++can be acquired without contention, or will wake up a waiting thread) ++when the counter is nonzero. The counter is decremented by one when a ++wait is satisfied. Both the initial and maximum count are established ++when the semaphore is created. ++ ++A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit ++identifier denoting its owner. A mutex is considered signaled when its ++owner is zero (indicating that it is not owned). The recursion count is ++incremented when a wait is satisfied, and ownership is set to the given ++identifier. ++ ++A mutex also holds an internal flag denoting whether its previous owner ++has died; such a mutex is said to be abandoned. 
Owner death is not ++tracked automatically based on thread death, but rather must be ++communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is ++inherently considered unowned. ++ ++Except for the "unowned" semantics of zero, the actual value of the ++owner identifier is not interpreted by the ntsync driver at all. The ++intended use is to store a thread identifier; however, the ntsync ++driver does not actually validate that a calling thread provides ++consistent or unique identifiers. ++ ++An event is similar to a semaphore with a maximum count of one. It holds ++a volatile boolean state denoting whether it is signaled or not. There ++are two types of events, auto-reset and manual-reset. An auto-reset ++event is designaled when a wait is satisfied; a manual-reset event is ++not. The event type is specified when the event is created. ++ ++Unless specified otherwise, all operations on an object are atomic and ++totally ordered with respect to other operations on the same object. ++ ++Objects are represented by files. When all file descriptors to an ++object are closed, that object is deleted. ++ ++Char device ++=========== ++ ++The ntsync driver creates a single char device /dev/ntsync. Each file ++description opened on the device represents a unique instance intended ++to back an individual NT virtual machine. Objects created by one ntsync ++instance may only be used with other objects created by the same ++instance. ++ ++ioctl reference ++=============== ++ ++All operations on the device are done through ioctls. There are four ++structures used in ioctl calls:: ++ ++ struct ntsync_sem_args { ++ __u32 sem; ++ __u32 count; ++ __u32 max; ++ }; ++ ++ struct ntsync_mutex_args { ++ __u32 mutex; ++ __u32 owner; ++ __u32 count; ++ }; ++ ++ struct ntsync_event_args { ++ __u32 event; ++ __u32 signaled; ++ __u32 manual; ++ }; ++ ++ struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 owner; ++ __u32 index; ++ __u32 alert; ++ __u32 flags; ++ __u32 pad; ++ }; ++ ++Depending on the ioctl, members of the structure may be used as input, ++output, or not at all. All ioctls return 0 on success. ++ ++The ioctls on the device file are as follows: ++ ++.. c:macro:: NTSYNC_IOC_CREATE_SEM ++ ++ Create a semaphore object. Takes a pointer to struct ++ :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``sem`` ++ - On output, contains a file descriptor to the created semaphore. ++ * - ``count`` ++ - Initial count of the semaphore. ++ * - ``max`` ++ - Maximum count of the semaphore. ++ ++ Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_MUTEX ++ ++ Create a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - On output, contains a file descriptor to the created mutex. ++ * - ``count`` ++ - Initial recursion count of the mutex. ++ * - ``owner`` ++ - Initial owner of the mutex. ++ ++ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is ++ zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_EVENT ++ ++ Create an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``event`` ++ - On output, contains a file descriptor to the created event. ++ * - ``signaled`` ++ - If nonzero, the event is initially signaled, otherwise ++ nonsignaled. 
++ * - ``manual`` ++ - If nonzero, the event is a manual-reset event, otherwise ++ auto-reset. ++ ++The ioctls on the individual objects are as follows: ++ ++.. c:macro:: NTSYNC_IOC_SEM_POST ++ ++ Post to a semaphore object. Takes a pointer to a 32-bit integer, ++ which on input holds the count to be added to the semaphore, and on ++ output contains its previous count. ++ ++ If adding to the semaphore's current count would raise the latter ++ past the semaphore's maximum count, the ioctl fails with ++ ``EOVERFLOW`` and the semaphore is not affected. If raising the ++ semaphore's count causes it to become signaled, eligible threads ++ waiting on this semaphore will be woken and the semaphore's count ++ decremented appropriately. ++ ++.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK ++ ++ Release a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - Ignored. ++ * - ``owner`` ++ - Specifies the owner trying to release this mutex. ++ * - ``count`` ++ - On output, contains the previous recursion count. ++ ++ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` ++ is not the current owner of the mutex, the ioctl fails with ++ ``EPERM``. ++ ++ The mutex's count will be decremented by one. If decrementing the ++ mutex's count causes it to become zero, the mutex is marked as ++ unowned and signaled, and eligible threads waiting on it will be ++ woken as appropriate. ++ ++.. c:macro:: NTSYNC_IOC_SET_EVENT ++ ++ Signal an event object. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ Eligible threads will be woken, and auto-reset events will be ++ designaled appropriately. ++ ++.. c:macro:: NTSYNC_IOC_RESET_EVENT ++ ++ Designal an event object. Takes a pointer to a 32-bit integer, which ++ on output contains the previous state of the event. ++ ++.. c:macro:: NTSYNC_IOC_PULSE_EVENT ++ ++ Wake threads waiting on an event object while leaving it in an ++ unsignaled state. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ A pulse operation can be thought of as a set followed by a reset, ++ performed as a single atomic operation. If two threads are waiting on ++ an auto-reset event which is pulsed, only one will be woken. If two ++ threads are waiting a manual-reset event which is pulsed, both will ++ be woken. However, in both cases, the event will be unsignaled ++ afterwards, and a simultaneous read operation will always report the ++ event as unsignaled. ++ ++.. c:macro:: NTSYNC_IOC_READ_SEM ++ ++ Read the current state of a semaphore object. Takes a pointer to ++ struct :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``sem`` ++ - Ignored. ++ * - ``count`` ++ - On output, contains the current count of the semaphore. ++ * - ``max`` ++ - On output, contains the maximum count of the semaphore. ++ ++.. c:macro:: NTSYNC_IOC_READ_MUTEX ++ ++ Read the current state of a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``mutex`` ++ - Ignored. ++ * - ``owner`` ++ - On output, contains the current owner of the mutex, or zero ++ if the mutex is not currently owned. ++ * - ``count`` ++ - On output, contains the current recursion count of the mutex. ++ ++ If the mutex is marked as abandoned, the function fails with ++ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to ++ zero. ++ ++.. 
c:macro:: NTSYNC_IOC_READ_EVENT ++ ++ Read the current state of an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``event`` ++ - Ignored. ++ * - ``signaled`` ++ - On output, contains the current state of the event. ++ * - ``manual`` ++ - On output, contains 1 if the event is a manual-reset event, ++ and 0 otherwise. ++ ++.. c:macro:: NTSYNC_IOC_KILL_OWNER ++ ++ Mark a mutex as unowned and abandoned if it is owned by the given ++ owner. Takes an input-only pointer to a 32-bit integer denoting the ++ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the ++ owner does not own the mutex, the function fails with ``EPERM``. ++ ++ Eligible threads waiting on the mutex will be woken as appropriate ++ (and such waits will fail with ``EOWNERDEAD``, as described below). ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ANY ++ ++ Poll on any of a list of objects, atomically acquiring at most one. ++ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is ++ used as follows: ++ ++ .. list-table:: ++ ++ * - ``timeout`` ++ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME`` ++ is set, the timeout is measured against the REALTIME clock; ++ otherwise it is measured against the MONOTONIC clock. If the ++ timeout is equal to or earlier than the current time, the ++ function returns immediately without sleeping. If ``timeout`` ++ is U64_MAX, the function will sleep until an object is ++ signaled, and will not fail with ``ETIMEDOUT``. ++ * - ``objs`` ++ - Pointer to an array of ``count`` file descriptors ++ (specified as an integer so that the structure has the same ++ size regardless of architecture). If any object is ++ invalid, the function fails with ``EINVAL``. ++ * - ``count`` ++ - Number of objects specified in the ``objs`` array. ++ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails ++ with ``EINVAL``. ++ * - ``owner`` ++ - Mutex owner identifier. If any object in ``objs`` is a mutex, ++ the ioctl will attempt to acquire that mutex on behalf of ++ ``owner``. If ``owner`` is zero, the ioctl fails with ++ ``EINVAL``. ++ * - ``index`` ++ - On success, contains the index (into ``objs``) of the object ++ which was signaled. If ``alert`` was signaled instead, ++ this contains ``count``. ++ * - ``alert`` ++ - Optional event object file descriptor. If nonzero, this ++ specifies an "alert" event object which, if signaled, will ++ terminate the wait. If nonzero, the identifier must point to a ++ valid event. ++ * - ``flags`` ++ - Zero or more flags. Currently the only flag is ++ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be ++ measured against the REALTIME clock instead of MONOTONIC. ++ * - ``pad`` ++ - Unused, must be set to zero. ++ ++ This function attempts to acquire one of the given objects. If unable ++ to do so, it sleeps until an object becomes signaled, subsequently ++ acquiring it, or the timeout expires. In the latter case the ioctl ++ fails with ``ETIMEDOUT``. The function only acquires one object, even ++ if multiple objects are signaled. ++ ++ A semaphore is considered to be signaled if its count is nonzero, and ++ is acquired by decrementing its count by one. A mutex is considered ++ to be signaled if it is unowned or if its owner matches the ``owner`` ++ argument, and is acquired by incrementing its recursion count by one ++ and setting its owner to the ``owner`` argument. 
An auto-reset event ++ is acquired by designaling it; a manual-reset event is not affected ++ by acquisition. ++ ++ Acquisition is atomic and totally ordered with respect to other ++ operations on the same object. If two wait operations (with different ++ ``owner`` identifiers) are queued on the same mutex, only one is ++ signaled. If two wait operations are queued on the same semaphore, ++ and a value of one is posted to it, only one is signaled. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Although this is a failure return, the function may ++ otherwise be considered successful. The mutex is marked as owned by ++ the given owner (with a recursion count of 1) and as no longer ++ abandoned, and ``index`` is still set to the index of the mutex. ++ ++ The ``alert`` argument is an "extra" event which can terminate the ++ wait, independently of all other objects. ++ ++ It is valid to pass the same object more than once, including by ++ passing the same event in the ``objs`` array and in ``alert``. If a ++ wakeup occurs due to that object being signaled, ``index`` is set to ++ the lowest index corresponding to that object. ++ ++ The function may fail with ``EINTR`` if a signal is received. ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ALL ++ ++ Poll on a list of objects, atomically acquiring all of them. Takes a ++ pointer to struct :c:type:`ntsync_wait_args`, which is used ++ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is ++ always filled with zero on success if not woken via alert. ++ ++ This function attempts to simultaneously acquire all of the given ++ objects. If unable to do so, it sleeps until all objects become ++ simultaneously signaled, subsequently acquiring them, or the timeout ++ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no ++ objects are modified. ++ ++ Objects may become signaled and subsequently designaled (through ++ acquisition by other threads) while this thread is sleeping. Only ++ once all objects are simultaneously signaled does the ioctl acquire ++ them and return. The entire acquisition is atomic and totally ordered ++ with respect to other operations on any of the given objects. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are ++ nevertheless marked as acquired. Note that if multiple mutex objects ++ are specified, there is no way to know which were marked as ++ abandoned. ++ ++ As with "any" waits, the ``alert`` argument is an "extra" event which ++ can terminate the wait. Critically, however, an "all" wait will ++ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is ++ signaled. In the latter case ``index`` will be set to ``count``. As ++ with "any" waits, if both conditions are filled, the former takes ++ priority, and objects in ``objs`` will be acquired. ++ ++ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same ++ object more than once, nor is it valid to pass the same object in ++ ``objs`` and in ``alert``. If this is attempted, the function fails ++ with ``EINVAL``. 
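
For reviewers of this uAPI patch, the following is a minimal user-space sketch of the documented flow (create a semaphore, post to it, wait on it). It is illustrative only and not part of the series; it assumes the macros and structures exported by include/uapi/linux/ntsync.h match the documentation above, and that the wait ioctls are issued on the /dev/ntsync file descriptor.

/* Minimal illustration of the ntsync uAPI described above; not part of the patch.
 * Assumes <linux/ntsync.h> provides the ioctls and structs exactly as documented,
 * and that NTSYNC_IOC_WAIT_ANY is issued on the device fd. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

int main(void)
{
	int dev = open("/dev/ntsync", O_RDWR);		/* one instance per emulated NT "VM" */
	if (dev < 0) { perror("open /dev/ntsync"); return 1; }

	struct ntsync_sem_args sem_args = { .count = 0, .max = 2 };
	if (ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem_args) < 0) { perror("create sem"); return 1; }
	int sem = sem_args.sem;				/* file descriptor of the new semaphore */

	__u32 post = 1;					/* count to add; returns previous count */
	if (ioctl(sem, NTSYNC_IOC_SEM_POST, &post) < 0) { perror("sem post"); return 1; }

	struct ntsync_wait_args wait_args = {
		.timeout = UINT64_MAX,			/* no timeout: sleep until signaled */
		.objs = (uintptr_t)&sem,		/* array of one object fd */
		.count = 1,
		.owner = 1,				/* arbitrary nonzero id; only consumed by mutexes */
	};
	if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait_args) < 0) { perror("wait any"); return 1; }

	printf("acquired object index %u\n", wait_args.index);
	close(sem);
	close(dev);
	return 0;
}

A real NT emulator would pass the calling thread's identifier as the owner and would typically wait on several objects at once; the error and ownership rules are exactly those spelled out in the documentation above.
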
diff --git a/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch b/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch new file mode 100644 index 0000000..b46522c --- /dev/null +++ b/debian/patches/misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch @@ -0,0 +1,21 @@ +From 06bc88f16094c6f38e0890992af4a32415716c5d Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 18 Jul 2024 21:19:39 +0200 +Subject: Revert "misc: ntsync: mark driver as "broken" to prevent from + building" + +This reverts commit f5b335dc025cfee90957efa90dc72fada0d5abb4. +--- + drivers/misc/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/misc/Kconfig ++++ b/drivers/misc/Kconfig +@@ -507,7 +507,6 @@ config OPEN_DICE + + config NTSYNC + tristate "NT synchronization primitive emulation" +- depends on BROKEN + help + This module provides kernel support for emulation of Windows NT + synchronization primitives. It is not a hardware driver. diff --git a/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch new file mode 100644 index 0000000..9a93cc6 --- /dev/null +++ b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch @@ -0,0 +1,32 @@ +From 1fd57f0a5e998d99035ecb3b110cbdb588403c83 Mon Sep 17 00:00:00 2001 +From: Felix Fietkau +Date: Sat, 5 Dec 2015 15:07:03 +0100 +Subject: [PATCH] mac80211: ignore AP power level when tx power type is "fixed" + +In some cases a user might want to connect to a far away access point, +which announces a low tx power limit. Using the AP's power limit can +make the connection significantly more unstable or even impossible, and +mac80211 currently provides no way to disable this behavior. + +To fix this, use the currently unused distinction between limited and +fixed tx power to decide whether a remote AP's power limit should be +accepted. 
+ +Signed-off-by: Felix Fietkau +Signed-off-by: Alexandre Frade +--- + net/mac80211/iface.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/mac80211/iface.c ++++ b/net/mac80211/iface.c +@@ -62,7 +62,8 @@ bool __ieee80211_recalc_txpower(struct i + if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL) + power = min(power, sdata->deflink.user_power_level); + +- if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL) ++ if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL && ++ sdata->vif.bss_conf.txpower_type != NL80211_TX_POWER_FIXED) + power = min(power, sdata->deflink.ap_power_level); + + if (power != sdata->vif.bss_conf.txpower) { diff --git a/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch b/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch new file mode 100644 index 0000000..51f9900 --- /dev/null +++ b/debian/patches/misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch @@ -0,0 +1,24 @@ +From: Felix Fietkau +Date: Tue, 23 Apr 2024 12:35:21 +0200 +Subject: [PATCH] net: enable fraglist GRO by default + +This can significantly improve performance for packet forwarding/bridging + +Signed-off-by: Felix Fietkau +--- + +--- a/include/linux/netdev_features.h ++++ b/include/linux/netdev_features.h +@@ -242,10 +242,10 @@ static inline int find_next_netdev_featu + #define NETIF_F_UPPER_DISABLES NETIF_F_LRO + + /* changeable features with no special hardware requirements */ +-#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) ++#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO | NETIF_F_GRO_FRAGLIST) + + /* Changeable features with no special hardware requirements that defaults to off. */ +-#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD) ++#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_UDP_FWD) + + #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ + NETIF_F_HW_VLAN_CTAG_RX | \ diff --git a/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch b/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch new file mode 100644 index 0000000..9240cc2 --- /dev/null +++ b/debian/patches/misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch @@ -0,0 +1,129 @@ +From: Felix Fietkau +Date: Thu, 15 Aug 2024 21:15:13 +0200 +Subject: [PATCH] net: remove NETIF_F_GSO_FRAGLIST from NETIF_F_GSO_SOFTWARE + +Several drivers set NETIF_F_GSO_SOFTWARE, but mangle fraglist GRO packets +in a way that they can't be properly segmented anymore. +In order to properly deal with this, remove fraglist GSO from +NETIF_F_GSO_SOFTWARE and switch to NETIF_F_GSO_SOFTWARE_ALL (which includes +fraglist GSO) in places where it's safe to add. 
+ +Signed-off-by: Felix Fietkau +--- + +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -110,7 +110,7 @@ static void dummy_setup(struct net_devic + dev->flags &= ~IFF_MULTICAST; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST; +- dev->features |= NETIF_F_GSO_SOFTWARE; ++ dev->features |= NETIF_F_GSO_SOFTWARE_ALL; + dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX; + dev->features |= NETIF_F_GSO_ENCAP_ALL; + dev->hw_features |= dev->features; +--- a/drivers/net/loopback.c ++++ b/drivers/net/loopback.c +@@ -172,7 +172,7 @@ static void gen_lo_setup(struct net_devi + dev->flags = IFF_LOOPBACK; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + netif_keep_dst(dev); +- dev->hw_features = NETIF_F_GSO_SOFTWARE; ++ dev->hw_features = NETIF_F_GSO_SOFTWARE_ALL; + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_GSO_SOFTWARE + | NETIF_F_HW_CSUM +--- a/drivers/net/macvlan.c ++++ b/drivers/net/macvlan.c +@@ -897,7 +897,7 @@ static int macvlan_hwtstamp_set(struct n + static struct lock_class_key macvlan_netdev_addr_lock_key; + + #define ALWAYS_ON_OFFLOADS \ +- (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE_ALL | \ + NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) + + #define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX) +--- a/include/linux/netdev_features.h ++++ b/include/linux/netdev_features.h +@@ -219,13 +219,14 @@ static inline int find_next_netdev_featu + + /* List of features with software fallbacks. */ + #define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ +- NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) ++ NETIF_F_GSO_UDP_L4) ++#define NETIF_F_GSO_SOFTWARE_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_FRAGLIST) + + /* + * If one device supports one of these features, then enable them + * for all in netdev_increment_features. 
+ */ +-#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ ++#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ROBUST | \ + NETIF_F_SG | NETIF_F_HIGHDMA | \ + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + +--- a/net/8021q/vlan.h ++++ b/net/8021q/vlan.h +@@ -108,7 +108,7 @@ static inline netdev_features_t vlan_tnl + netdev_features_t ret; + + ret = real_dev->hw_enc_features & +- (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | ++ (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE_ALL | + NETIF_F_GSO_ENCAP_ALL); + + if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) +--- a/net/8021q/vlan_dev.c ++++ b/net/8021q/vlan_dev.c +@@ -561,7 +561,7 @@ static int vlan_dev_init(struct net_devi + dev->state |= (1 << __LINK_STATE_NOCARRIER); + + dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | +- NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | ++ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE_ALL | + NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | + NETIF_F_ALL_FCOE; +@@ -654,7 +654,7 @@ static netdev_features_t vlan_dev_fix_fe + if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + lower_features |= NETIF_F_HW_CSUM; + features = netdev_intersect_features(features, lower_features); +- features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); ++ features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE_ALL); + features |= NETIF_F_LLTX; + + return features; +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -2451,7 +2451,7 @@ void sk_setup_caps(struct sock *sk, stru + if (sk_is_tcp(sk)) + sk->sk_route_caps |= NETIF_F_GSO; + if (sk->sk_route_caps & NETIF_F_GSO) +- sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; ++ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE_ALL; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + if (sk_can_gso(sk)) { +--- a/net/mac80211/ieee80211_i.h ++++ b/net/mac80211/ieee80211_i.h +@@ -2009,7 +2009,7 @@ void ieee80211_color_collision_detection + /* interface handling */ + #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ + NETIF_F_HW_CSUM | NETIF_F_SG | \ +- NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \ ++ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE_ALL | \ + NETIF_F_HW_TC) + #define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM) + #define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \ +--- a/net/openvswitch/vport-internal_dev.c ++++ b/net/openvswitch/vport-internal_dev.c +@@ -109,7 +109,7 @@ static void do_setup(struct net_device * + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | +- NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; ++ NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ENCAP_ALL; + + netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; diff --git a/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch new file mode 100644 index 0000000..dda4678 --- /dev/null +++ b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch @@ -0,0 +1,762 @@ +From a657df31affbb91d8cb2718e70f42cf8ed6e9a7c Mon Sep 17 00:00:00 2001 +From: graysky +Date: Mon, 16 Sep 2024 05:55:58 -0400 +Subject: ZEN: Add graysky's more-ISA-levels-and-uarches +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From https://github.com/graysky2/kernel_compiler_patch + 
+more-ISA-levels-and-uarches-for-kernel-6.8-rc4+.patch + +FEATURES +This patch adds additional tunings via new x86-64 ISA levels and +more micro-architecture options to the Linux kernel in three classes. + +1. New generic x86-64 ISA levels + +These are selectable under: + Processor type and features ---> x86-64 compiler ISA level + +• x86-64 A value of (1) is the default +• x86-64-v2 A value of (2) brings support for vector + instructions up to Streaming SIMD Extensions 4.2 (SSE4.2) + and Supplemental Streaming SIMD Extensions 3 (SSSE3), the + POPCNT instruction, and CMPXCHG16B. +• x86-64-v3 A value of (3) adds vector instructions up to AVX2, MOVBE, + and additional bit-manipulation instructions. + +There is also x86-64-v4 but including this makes little sense as +the kernel does not use any of the AVX512 instructions anyway. + +Users of glibc 2.33 and above can see which level is supported by running: + /lib/ld-linux-x86-64.so.2 --help | grep supported +Or + /lib64/ld-linux-x86-64.so.2 --help | grep supported + +2. New micro-architectures + +These are selectable under: + Processor type and features ---> Processor family + +• AMD Improved K8-family +• AMD K10-family +• AMD Family 10h (Barcelona) +• AMD Family 14h (Bobcat) +• AMD Family 16h (Jaguar) +• AMD Family 15h (Bulldozer) +• AMD Family 15h (Piledriver) +• AMD Family 15h (Steamroller) +• AMD Family 15h (Excavator) +• AMD Family 17h (Zen) +• AMD Family 17h (Zen 2) +• AMD Family 19h (Zen 3)** +• AMD Family 19h (Zen 4)‡ +• AMD Family 1Ah (Zen 5)§ +• Intel Silvermont low-power processors +• Intel Goldmont low-power processors (Apollo Lake and Denverton) +• Intel Goldmont Plus low-power processors (Gemini Lake) +• Intel 1st Gen Core i3/i5/i7 (Nehalem) +• Intel 1.5 Gen Core i3/i5/i7 (Westmere) +• Intel 2nd Gen Core i3/i5/i7 (Sandybridge) +• Intel 3rd Gen Core i3/i5/i7 (Ivybridge) +• Intel 4th Gen Core i3/i5/i7 (Haswell) +• Intel 5th Gen Core i3/i5/i7 (Broadwell) +• Intel 6th Gen Core i3/i5/i7 (Skylake) +• Intel 6th Gen Core i7/i9 (Skylake X) +• Intel 8th Gen Core i3/i5/i7 (Cannon Lake) +• Intel 10th Gen Core i7/i9 (Ice Lake) +• Intel Xeon (Cascade Lake) +• Intel Xeon (Cooper Lake)* +• Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* +• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)† +• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)† +• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)† +• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)‡ +• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)‡ +• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)‡ + +Notes: If not otherwise noted, gcc >=9.1 is required for support. + *Requires gcc >=10.1 or clang >=10.0 + **Required gcc >=10.3 or clang >=12.0 + †Required gcc >=11.1 or clang >=12.0 + ‡Required gcc >=13.0 or clang >=15.0.5 + §Required gcc >14.0 or clang >=19.0? + +3. Auto-detected micro-architecture levels + +Compile by passing the '-march=native' option which, "selects the CPU +to generate code for at compilation time by determining the processor type of +the compiling machine. Using -march=native enables all instruction subsets +supported by the local machine and will produce code optimized for the local +machine under the constraints of the selected instruction set."[1] + +Users of Intel CPUs should select the 'Intel-Native' option and users of AMD +CPUs should select the 'AMD-Native' option. + +MINOR NOTES RELATING TO INTEL ATOM PROCESSORS +This patch also changes -march=atom to -march=bonnell in accordance with the +gcc v4.9 changes. 
Upstream is using the deprecated -match=atom flags when I +believe it should use the newer -march=bonnell flag for atom processors.[2] + +It is not recommended to compile on Atom-CPUs with the 'native' option.[3] The +recommendation is to use the 'atom' option instead. + +BENEFITS +Small but real speed increases are measurable using a make endpoint comparing +a generic kernel to one built with one of the respective microarchs. + +See the following experimental evidence supporting this statement: +https://github.com/graysky2/kernel_compiler_patch?tab=readme-ov-file#benchmarks + +REQUIREMENTS +linux version 6.8-rc3+ +gcc version >=9.0 or clang version >=9.0 + +ACKNOWLEDGMENTS +This patch builds on the seminal work by Jeroen.[4] + +REFERENCES +1. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options +2. https://bugzilla.kernel.org/show_bug.cgi?id=77461 +3. https://github.com/graysky2/kernel_gcc_patch/issues/15 +4. http://www.linuxforge.net/docs/linux/linux-gcc.php +--- + arch/x86/Kconfig.cpu | 363 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 89 +++++++- + arch/x86/include/asm/vermagic.h | 70 ++++++ + 3 files changed, 506 insertions(+), 16 deletions(-) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -155,9 +155,8 @@ config MPENTIUM4 + -Paxville + -Dempsey + +- + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + help + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +164,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + help + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +172,114 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ help ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ help ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ help ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ help ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ help ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ help ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ help ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ help ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ help ++ Select this for AMD Family 15h Excavator processors. 
++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ help ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ help ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ ++config MZEN3 ++ bool "AMD Zen 3" ++ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ Select this for AMD Family 19h Zen 3 processors. ++ ++ Enables -march=znver3 ++ ++config MZEN4 ++ bool "AMD Zen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000) ++ help ++ Select this for AMD Family 19h Zen 4 processors. ++ ++ Enables -march=znver4 ++ ++config MZEN5 ++ bool "AMD Zen 5" ++ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000) ++ help ++ Select this for AMD Family 19h Zen 5 processors. ++ ++ Enables -march=znver5 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -269,8 +370,17 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ help ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + help + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,14 +388,191 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" + help + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ help ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ help ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ help ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ help ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ help ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ help ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ help ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ help ++ ++ Select this for 5th Gen Core processors in the Broadwell family. 
++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ help ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ help ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ help ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake ++ ++config MCOOPERLAKE ++ bool "Intel Cooper Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ help ++ ++ Select this for Xeon processors in the Cooper Lake family. ++ ++ Enables -march=cooperlake ++ ++config MTIGERLAKE ++ bool "Intel Tiger Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ help ++ ++ Select this for third-generation 10 nm process processors in the Tiger Lake family. ++ ++ Enables -march=tigerlake ++ ++config MSAPPHIRERAPIDS ++ bool "Intel Sapphire Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family. ++ ++ Enables -march=sapphirerapids ++ ++config MROCKETLAKE ++ bool "Intel Rocket Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for eleventh-generation processors in the Rocket Lake family. ++ ++ Enables -march=rocketlake ++ ++config MALDERLAKE ++ bool "Intel Alder Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ ++ Select this for twelfth-generation processors in the Alder Lake family. ++ ++ Enables -march=alderlake ++ ++config MRAPTORLAKE ++ bool "Intel Raptor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for thirteenth-generation processors in the Raptor Lake family. ++ ++ Enables -march=raptorlake ++ ++config MMETEORLAKE ++ bool "Intel Meteor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for fourteenth-generation processors in the Meteor Lake family. ++ ++ Enables -march=meteorlake ++ ++config MEMERALDRAPIDS ++ bool "Intel Emerald Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ ++ Select this for fifth-generation 10 nm process processors in the Emerald Rapids family. ++ ++ Enables -march=emeraldrapids + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -294,8 +581,32 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE_INTEL ++ bool "Intel-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for AMD CPUs. Intel Only! 
++ ++ Enables -march=native ++ ++config MNATIVE_AMD ++ bool "AMD-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for Intel CPUs. AMD Only! ++ ++ Enables -march=native ++ + endchoice + ++config SUPPORT_MARCH_CODEVERS ++ bool ++ default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +@@ -308,6 +619,30 @@ config X86_GENERIC + This is really intended for distributors who need more + generic optimizations. + ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 3 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. ++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + # + # Define implied options from the CPU selection here + config X86_INTERNODE_CACHE_SHIFT +@@ -318,7 +653,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD + default "4" if MELAN || M486SX || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -336,11 +671,11 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL + + config X86_USE_PPRO_CHECKSUM + def_bool y 
+- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD + + # + # P6_NOPs are a relatively minor optimization that require a family >= +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -178,14 +178,99 @@ else + cflags-$(CONFIG_MPSC) += -march=nocona + cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MATOM) += -march=atom +- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ ifeq ($(CONFIG_X86_64_VERSION),1) ++ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ else ++ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION) ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++ endif ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MZEN5) += -march=znver5 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -mno-tbm ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake ++ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake ++ 
cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids + KBUILD_CFLAGS += $(cflags-y) + + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 + rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom +- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 ++ rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 ++ rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona ++ rustflags-$(CONFIG_MBOBCAT) += -Ctarget-cpu=btver1 ++ rustflags-$(CONFIG_MJAGUAR) += -Ctarget-cpu=btver2 ++ rustflags-$(CONFIG_MBULLDOZER) += -Ctarget-cpu=bdver1 ++ rustflags-$(CONFIG_MPILEDRIVER) += -Ctarget-cpu=bdver2 ++ rustflags-$(CONFIG_MSTEAMROLLER) += -Ctarget-cpu=bdver3 ++ rustflags-$(CONFIG_MEXCAVATOR) += -Ctarget-cpu=bdver4 ++ rustflags-$(CONFIG_MZEN) += -Ctarget-cpu=znver1 ++ rustflags-$(CONFIG_MZEN2) += -Ctarget-cpu=znver2 ++ rustflags-$(CONFIG_MZEN3) += -Ctarget-cpu=znver3 ++ rustflags-$(CONFIG_MZEN4) += -Ctarget-cpu=znver4 ++ rustflags-$(CONFIG_MZEN5) += -Ctarget-cpu=znver5 ++ rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MNATIVE_AMD) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=bonnell ++ rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 ++ rustflags-$(CONFIG_MNEHALEM) += -Ctarget-cpu=nehalem ++ rustflags-$(CONFIG_MWESTMERE) += -Ctarget-cpu=westmere ++ rustflags-$(CONFIG_MSILVERMONT) += -Ctarget-cpu=silvermont ++ rustflags-$(CONFIG_MGOLDMONT) += -Ctarget-cpu=goldmont ++ rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus ++ rustflags-$(CONFIG_MSANDYBRIDGE) += -Ctarget-cpu=sandybridge ++ rustflags-$(CONFIG_MIVYBRIDGE) += -Ctarget-cpu=ivybridge ++ rustflags-$(CONFIG_MHASWELL) += -Ctarget-cpu=haswell ++ rustflags-$(CONFIG_MBROADWELL) += -Ctarget-cpu=broadwell ++ rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake ++ rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512 ++ rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake ++ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake ++ rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake ++ rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake ++ rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids ++ rustflags-$(CONFIG_MROCKETLAKE) += -Ctarget-cpu=rocketlake ++ rustflags-$(CONFIG_MALDERLAKE) += -Ctarget-cpu=alderlake ++ rustflags-$(CONFIG_MRAPTORLAKE) += -Ctarget-cpu=raptorlake ++ rustflags-$(CONFIG_MMETEORLAKE) += -Ctarget-cpu=meteorlake ++ rustflags-$(CONFIG_MEMERALDRAPIDS) += -Ctarget-cpu=emeraldrapids + KBUILD_RUSTFLAGS += $(rustflags-y) + + KBUILD_CFLAGS += -mno-red-zone +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -17,6 +17,54 @@ + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE_INTEL ++#define MODULE_PROC_FAMILY "NATIVE_INTEL " ++#elif defined CONFIG_MNATIVE_AMD ++#define MODULE_PROC_FAMILY "NATIVE_AMD " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define 
MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY "CASCADELAKE " ++#elif defined CONFIG_MCOOPERLAKE ++#define MODULE_PROC_FAMILY "COOPERLAKE " ++#elif defined CONFIG_MTIGERLAKE ++#define MODULE_PROC_FAMILY "TIGERLAKE " ++#elif defined CONFIG_MSAPPHIRERAPIDS ++#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " ++#elif defined CONFIG_ROCKETLAKE ++#define MODULE_PROC_FAMILY "ROCKETLAKE " ++#elif defined CONFIG_MALDERLAKE ++#define MODULE_PROC_FAMILY "ALDERLAKE " ++#elif defined CONFIG_MRAPTORLAKE ++#define MODULE_PROC_FAMILY "RAPTORLAKE " ++#elif defined CONFIG_MMETEORLAKE ++#define MODULE_PROC_FAMILY "METEORLAKE " ++#elif defined CONFIG_MEMERALDRAPIDS ++#define MODULE_PROC_FAMILY "EMERALDRAPIDS " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -35,6 +83,28 @@ + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE diff --git a/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch b/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch new file mode 100644 index 0000000..35fee6e --- /dev/null +++ b/debian/patches/mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch @@ -0,0 +1,60 @@ +From 44295ad130b8735cecb288dd7463a14892803d9b Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 1 Oct 2024 02:05:12 +0200 +Subject: ZEN: Fixup graysky's more-ISA-levels-and-uarches + +See: https://github.com/graysky2/kernel_compiler_patch/issues/105 +--- + arch/x86/Kconfig.cpu | 4 ---- + arch/x86/Makefile | 4 ---- + arch/x86/include/asm/vermagic.h | 6 ++++++ + 3 files changed, 6 insertions(+), 8 deletions(-) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -603,10 +603,6 @@ config MNATIVE_AMD + + endchoice + +-config SUPPORT_MARCH_CODEVERS +- bool +- default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) +- + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -176,8 +176,6 @@ 
else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona +- cflags-$(CONFIG_MCORE2) += -march=core2 +- cflags-$(CONFIG_MATOM) += -march=atom + ifeq ($(CONFIG_X86_64_VERSION),1) + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic +@@ -229,8 +227,6 @@ else + + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona +- rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 +- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom + rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 + rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 + rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -105,6 +105,12 @@ + #define MODULE_PROC_FAMILY "ZEN " + #elif defined CONFIG_MZEN2 + #define MODULE_PROC_FAMILY "ZEN2 " ++#elif defined CONFIG_MZEN3 ++#define MODULE_PROC_FAMILY "ZEN3 " ++#elif defined CONFIG_MZEN4 ++#define MODULE_PROC_FAMILY "ZEN4 " ++#elif defined CONFIG_MZEN5 ++#define MODULE_PROC_FAMILY "ZEN5 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE diff --git a/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch new file mode 100644 index 0000000..b3ff8a4 --- /dev/null +++ b/debian/patches/mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -0,0 +1,40 @@ +From 8dc948926f5b68b16a6a47a8f6e0a2154ac8ef3e Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sun, 11 Dec 2022 23:51:16 +0100 +Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3 + +This reverts a6036a41bffba3d5007e377483b425d470ad8042 (kbuild: drop +support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3), removes the +dependency on CONFIG_ARC and adds RUSTFLAGS. +--- + Makefile | 3 +++ + init/Kconfig | 6 ++++++ + 2 files changed, 9 insertions(+) + +--- a/Makefile ++++ b/Makefile +@@ -814,6 +814,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointe + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1407,6 +1407,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. 
++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help diff --git a/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch new file mode 100644 index 0000000..2507221 --- /dev/null +++ b/debian/patches/mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -0,0 +1,11 @@ +--- a/Makefile ++++ b/Makefile +@@ -815,7 +815,7 @@ ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 + else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 +-KBUILD_CFLAGS += -O3 ++KBUILD_CFLAGS += -O3 $(call cc-option,-fivopts) + KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os diff --git a/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch b/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch new file mode 100644 index 0000000..b66b0b9 --- /dev/null +++ b/debian/patches/mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch @@ -0,0 +1,25 @@ +From 3ebc1fdf3e0ee9bff1efe20eb5791eba5c84a810 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 3 Aug 2023 13:53:49 +0000 +Subject: [PATCH 01/19] XANMOD: x86/build: Prevent generating avx2 and avx512 + floating-point code + +Signed-off-by: Alexandre Frade +--- + arch/x86/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -70,9 +70,9 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f + KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +-KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ++KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f + + # + # CFLAGS for compiling floating point code inside the kernel. 
diff --git a/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch b/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch new file mode 100644 index 0000000..58c9da1 --- /dev/null +++ b/debian/patches/mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch @@ -0,0 +1,11 @@ +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -70,7 +70,7 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f -fno-tree-vectorize + KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json + KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f + diff --git a/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch new file mode 100644 index 0000000..01b3aed --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch @@ -0,0 +1,70 @@ +From 3427331872c37b2edb42406c65764e1565b0591b Mon Sep 17 00:00:00 2001 +From: Perry Yuan +Date: Fri, 9 Aug 2024 14:09:05 +0800 +Subject: cpufreq: amd-pstate: add quirk for Ryzen 3000 series processor + +The Ryzen 3000 series processors have been observed lacking the +nominal_freq and lowest_freq parameters in their ACPI tables. This +absence causes issues with loading the amd-pstate driver on these +systems. Introduces a fix to resolve the dependency issue +by adding a quirk specifically for the Ryzen 3000 series. + +Reported-by: David Wang <00107082@163.com> +Signed-off-by: Perry Yuan +--- + drivers/cpufreq/amd-pstate.c | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -142,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62 + .lowest_freq = 550, + }; + ++static struct quirk_entry quirk_amd_mts = { ++ .nominal_freq = 3600, ++ .lowest_freq = 550, ++}; ++ + static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) + { + /** +@@ -158,6 +163,21 @@ static int __init dmi_matched_7k62_bios_ + return 0; + } + ++static int __init dmi_matched_mts_bios_bug(const struct dmi_system_id *dmi) ++{ ++ /** ++ * match the broken bios for ryzen 3000 series processor support CPPC V2 ++ * broken BIOS lack of nominal_freq and lowest_freq capabilities ++ * definition in ACPI tables ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ZEN2)) { ++ quirks = dmi->driver_data; ++ pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); ++ return 1; ++ } ++ ++ return 0; ++} + static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { + { + .callback = dmi_matched_7k62_bios_bug, +@@ -168,6 +188,16 @@ static const struct dmi_system_id amd_ps + }, + .driver_data = &quirk_amd_7k62, + }, ++ { ++ .callback = dmi_matched_mts_bios_bug, ++ .ident = "AMD Ryzen 3000", ++ .matches = { ++ DMI_MATCH(DMI_PRODUCT_NAME, "B450M MORTAR MAX (MS-7B89)"), ++ DMI_MATCH(DMI_BIOS_RELEASE, "06/10/2020"), ++ DMI_MATCH(DMI_BIOS_VERSION, "5.14"), ++ }, ++ .driver_data = &quirk_amd_mts, ++ }, + {} + }; + MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); diff --git a/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch 
b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch new file mode 100644 index 0000000..a2f1bf5 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch @@ -0,0 +1,88 @@ +From 44f21855901b1fd618ac16b07dbd14e8fea4ee13 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 31 Aug 2024 21:49:11 -0500 +Subject: cpufreq/amd-pstate: Export symbols for changing modes + +In order to effectively test all mode switch combinations export +everything necessarily for amd-pstate-ut to trigger a mode switch. + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 23 ++++++++++------------- + drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++ + 2 files changed, 24 insertions(+), 13 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -60,18 +60,6 @@ + #define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF + #define AMD_CPPC_EPP_POWERSAVE 0xFF + +-/* +- * enum amd_pstate_mode - driver working mode of amd pstate +- */ +-enum amd_pstate_mode { +- AMD_PSTATE_UNDEFINED = 0, +- AMD_PSTATE_DISABLE, +- AMD_PSTATE_PASSIVE, +- AMD_PSTATE_ACTIVE, +- AMD_PSTATE_GUIDED, +- AMD_PSTATE_MAX, +-}; +- + static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_UNDEFINED] = "undefined", + [AMD_PSTATE_DISABLE] = "disable", +@@ -81,6 +69,14 @@ static const char * const amd_pstate_mod + NULL, + }; + ++const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) ++{ ++ if (mode < 0 || mode >= AMD_PSTATE_MAX) ++ return NULL; ++ return amd_pstate_mode_string[mode]; ++} ++EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); ++ + struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +@@ -1392,7 +1388,7 @@ static ssize_t amd_pstate_show_status(ch + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); + } + +-static int amd_pstate_update_status(const char *buf, size_t size) ++int amd_pstate_update_status(const char *buf, size_t size) + { + int mode_idx; + +@@ -1409,6 +1405,7 @@ static int amd_pstate_update_status(cons + + return 0; + } ++EXPORT_SYMBOL_GPL(amd_pstate_update_status); + + static ssize_t status_show(struct device *dev, + struct device_attribute *attr, char *buf) +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -103,4 +103,18 @@ struct amd_cpudata { + bool boost_state; + }; + ++/* ++ * enum amd_pstate_mode - driver working mode of amd pstate ++ */ ++enum amd_pstate_mode { ++ AMD_PSTATE_UNDEFINED = 0, ++ AMD_PSTATE_DISABLE, ++ AMD_PSTATE_PASSIVE, ++ AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, ++ AMD_PSTATE_MAX, ++}; ++const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); ++int amd_pstate_update_status(const char *buf, size_t size); ++ + #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch new file mode 100644 index 0000000..ccfc5ab --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch @@ -0,0 +1,77 @@ +From aabfc7370a7da9c52be97c79ba70a20201e6864a Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 31 Aug 2024 21:49:12 -0500 +Subject: cpufreq/amd-pstate-ut: Add test case for mode switches + +There is a state machine in the amd-pstate driver utilized for +switches for all modes. 
To make sure that cleanup and setup works +properly for each mode add a unit test case that tries all +combinations. + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++- + 1 file changed, 40 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid + static void amd_pstate_ut_check_enabled(u32 index); + static void amd_pstate_ut_check_perf(u32 index); + static void amd_pstate_ut_check_freq(u32 index); ++static void amd_pstate_ut_check_driver(u32 index); + + static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { + {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, + {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, + {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, +- {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq } ++ {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, ++ {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } + }; + + static bool get_shared_mem(void) +@@ -257,6 +259,43 @@ skip_test: + cpufreq_cpu_put(policy); + } + ++static int amd_pstate_set_mode(enum amd_pstate_mode mode) ++{ ++ const char *mode_str = amd_pstate_get_mode_string(mode); ++ ++ pr_debug("->setting mode to %s\n", mode_str); ++ ++ return amd_pstate_update_status(mode_str, strlen(mode_str)); ++} ++ ++static void amd_pstate_ut_check_driver(u32 index) ++{ ++ enum amd_pstate_mode mode1, mode2; ++ int ret; ++ ++ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { ++ ret = amd_pstate_set_mode(mode1); ++ if (ret) ++ goto out; ++ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) { ++ if (mode1 == mode2) ++ continue; ++ ret = amd_pstate_set_mode(mode2); ++ if (ret) ++ goto out; ++ } ++ } ++out: ++ if (ret) ++ pr_warn("%s: failed to update status for %s->%s: %d\n", __func__, ++ amd_pstate_get_mode_string(mode1), ++ amd_pstate_get_mode_string(mode2), ret); ++ ++ amd_pstate_ut_cases[index].result = ret ? ++ AMD_PSTATE_UT_RESULT_FAIL : ++ AMD_PSTATE_UT_RESULT_PASS; ++} ++ + static int __init amd_pstate_ut_init(void) + { + u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); diff --git a/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch new file mode 100644 index 0000000..0201b74 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch @@ -0,0 +1,60 @@ +From 24e62fbc101d079d398ac6fc76f458676d3d9491 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sun, 1 Sep 2024 00:00:35 -0500 +Subject: cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit() + +amd_pstate_set_epp() calls cppc_set_epp_perf() which can fail for +a variety of reasons but this is ignored. Change the return flow +to allow failures. 
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1595,7 +1595,7 @@ static void amd_pstate_epp_cpu_exit(stru + pr_debug("CPU %d exiting\n", policy->cpu); + } + +-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) ++static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; + u32 max_perf, min_perf, min_limit_perf, max_limit_perf; +@@ -1645,7 +1645,7 @@ static void amd_pstate_epp_update_limit( + * This return value can only be negative for shared_memory + * systems where EPP register read/write not supported. + */ +- return; ++ return epp; + } + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +@@ -1658,12 +1658,13 @@ static void amd_pstate_epp_update_limit( + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); +- amd_pstate_set_epp(cpudata, epp); ++ return amd_pstate_set_epp(cpudata, epp); + } + + static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; +@@ -1673,7 +1674,9 @@ static int amd_pstate_epp_set_policy(str + + cpudata->policy = policy->policy; + +- amd_pstate_epp_update_limit(policy); ++ ret = amd_pstate_epp_update_limit(policy); ++ if (ret) ++ return ret; + + /* + * policy->cur is never updated with the amd_pstate_epp driver, but it diff --git a/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch b/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch new file mode 100644 index 0000000..32b183b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch @@ -0,0 +1,67 @@ +From 29c0347dd542e091e2f7e5980dd885f918f5f676 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:57 -0500 +Subject: x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c + +To prepare to let amd_get_highest_perf() detect preferred cores +it will require CPPC functions. Move amd_get_highest_perf() to +cppc.c to prepare for 'preferred core detection' rework. + +No functional changes intended. + +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++ + arch/x86/kernel/cpu/amd.c | 16 ---------------- + 2 files changed, 16 insertions(+), 16 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void) + init_done = true; + mutex_unlock(&freq_invariance_lock); + } ++ ++u32 amd_get_highest_perf(void) ++{ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++ ++ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || ++ (c->x86_model >= 0x70 && c->x86_model < 0x80))) ++ return 166; ++ ++ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || ++ (c->x86_model >= 0x40 && c->x86_model < 0x70))) ++ return 166; ++ ++ return 255; ++} ++EXPORT_SYMBOL_GPL(amd_get_highest_perf); +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsig + } + EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); + +-u32 amd_get_highest_perf(void) +-{ +- struct cpuinfo_x86 *c = &boot_cpu_data; +- +- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) +- return 166; +- +- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) +- return 166; +- +- return 255; +-} +-EXPORT_SYMBOL_GPL(amd_get_highest_perf); +- + static void zenbleed_check_cpu(void *unused) + { + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch b/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch new file mode 100644 index 0000000..7548e16 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch @@ -0,0 +1,95 @@ +From 072efeb45349edd8ba9def11b6a450eaf56690a8 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:58 -0500 +Subject: ACPI: CPPC: Adjust return code for inline functions in + !CONFIG_ACPI_CPPC_LIB + +Checkpath emits the following warning: +``` +WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP +``` + +Adjust the code accordingly. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + include/acpi/cppc_acpi.h | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -164,31 +164,31 @@ extern int cppc_set_auto_sel(int cpu, bo + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_enable(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline bool cppc_perf_ctrs_in_pcc(void) + { +@@ -212,27 +212,27 @@ static inline bool cpc_ffh_supported(voi + } + static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_set_auto_sel(int cpu, bool enable) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) + { +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + } + #endif /* !CONFIG_ACPI_CPPC_LIB */ + diff --git a/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch b/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch new file mode 100644 index 0000000..dea1ebe --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch @@ -0,0 +1,162 @@ +From 21492d91ffc7c3fdb6507f64a74abf8326c75141 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:29:59 -0500 +Subject: x86/amd: Rename amd_get_highest_perf() to + amd_get_boost_ratio_numerator() + +The function name is ambiguous because it returns an intermediate value +for calculating maximum frequency rather than the CPPC 'Highest Perf' +register. + +Rename the function to clarify its use and allow the function to return +errors. Adjust the consumer in acpi-cpufreq to catch errors. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/include/asm/processor.h | 3 --- + arch/x86/kernel/acpi/cppc.c | 44 +++++++++++++++++++++++--------- + drivers/cpufreq/acpi-cpufreq.c | 12 ++++++--- + include/acpi/cppc_acpi.h | 5 ++++ + 4 files changed, 46 insertions(+), 18 deletions(-) + +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigne + } + + #ifdef CONFIG_CPU_SUP_AMD +-extern u32 amd_get_highest_perf(void); +- + /* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. +@@ -705,7 +703,6 @@ static __always_inline void amd_clear_di + + extern void amd_check_microcode(void); + #else +-static inline u32 amd_get_highest_perf(void) { return 0; } + static inline void amd_clear_divider(void) { } + static inline void amd_check_microcode(void) { } + #endif +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc + static void amd_set_max_freq_ratio(void) + { + struct cppc_perf_caps perf_caps; +- u64 highest_perf, nominal_perf; ++ u64 numerator, nominal_perf; + u64 perf_ratio; + int rc; + +@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void) + return; + } + +- highest_perf = amd_get_highest_perf(); ++ rc = amd_get_boost_ratio_numerator(0, &numerator); ++ if (rc) { ++ pr_debug("Could not retrieve highest performance (%d)\n", rc); ++ return; ++ } + nominal_perf = perf_caps.nominal_perf; + +- if (!highest_perf || !nominal_perf) { +- pr_debug("Could not retrieve highest or nominal performance\n"); ++ if (!nominal_perf) { ++ pr_debug("Could not retrieve nominal performance\n"); + return; + } + +- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); ++ perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { +@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void) + mutex_unlock(&freq_invariance_lock); + } + +-u32 amd_get_highest_perf(void) ++/** ++ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation ++ * @cpu: CPU to get numerator for. ++ * @numerator: Output variable for numerator. ++ * ++ * Determine the numerator to use for calculating the boost ratio on ++ * a CPU. On systems that support preferred cores, this will be a hardcoded ++ * value. On other systems this will the highest performance register value. ++ * ++ * Return: 0 for success, negative error code otherwise. 
++ */ ++int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) +- return 166; ++ (c->x86_model >= 0x70 && c->x86_model < 0x80))) { ++ *numerator = 166; ++ return 0; ++ } + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) +- return 166; ++ (c->x86_model >= 0x40 && c->x86_model < 0x70))) { ++ *numerator = 166; ++ return 0; ++ } ++ *numerator = 255; + +- return 255; ++ return 0; + } +-EXPORT_SYMBOL_GPL(amd_get_highest_perf); ++EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); +--- a/drivers/cpufreq/acpi-cpufreq.c ++++ b/drivers/cpufreq/acpi-cpufreq.c +@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned + return 0; + } + +- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) +- highest_perf = amd_get_highest_perf(); +- else ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { ++ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf); ++ if (ret) { ++ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n", ++ cpu, ret); ++ return 0; ++ } ++ } else { + highest_perf = perf_caps.highest_perf; ++ } + + nominal_perf = perf_caps.nominal_perf; + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum, + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); + extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); + extern int cppc_set_auto_sel(int cpu, bool enable); ++extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -234,6 +235,10 @@ static inline int cppc_get_auto_sel_caps + { + return -EOPNOTSUPP; + } ++static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) ++{ ++ return -EOPNOTSUPP; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ diff --git a/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch b/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch new file mode 100644 index 0000000..dcfd0a2 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch @@ -0,0 +1,35 @@ +From 6f10d066dce0f1781b514a0352f0b427a32b1bb2 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:00 -0500 +Subject: ACPI: CPPC: Drop check for non zero perf ratio + +perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number. +Shifting by one will never have a zero value. + +Drop the check. + +Suggested-by: Gautham R. Shenoy +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void) + return; + } + +- perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ +- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; +- if (!perf_ratio) { +- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); +- return; +- } ++ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; + + freq_invariance_set_perf_ratio(perf_ratio, false); + } diff --git a/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch b/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch new file mode 100644 index 0000000..f7caec6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch @@ -0,0 +1,44 @@ +From 8c142a91a58f24119e99d4e66b11890f4a4ef984 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:01 -0500 +Subject: ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn + +If the boost ratio isn't calculated properly for the system for any +reason this can cause other problems that are non-obvious. + +Raise all messages to warn instead. + +Suggested-by: Perry Yuan +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void) + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { +- pr_debug("Could not retrieve perf counters (%d)\n", rc); ++ pr_warn("Could not retrieve perf counters (%d)\n", rc); + return; + } + + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { +- pr_debug("Could not retrieve highest performance (%d)\n", rc); ++ pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } + nominal_perf = perf_caps.nominal_perf; + + if (!nominal_perf) { +- pr_debug("Could not retrieve nominal performance\n"); ++ pr_warn("Could not retrieve nominal performance\n"); + return; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch b/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch new file mode 100644 index 0000000..8081de8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch @@ -0,0 +1,138 @@ +From 952e7bdc4cf67603f230f8eb91818ad4676e5a83 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:02 -0500 +Subject: x86/amd: Move amd_get_highest_perf() out of amd-pstate + +amd_pstate_get_highest_perf() is a helper used to get the highest perf +value on AMD systems. It's used in amd-pstate as part of preferred +core handling, but applicable for acpi-cpufreq as well. + +Move it out to cppc handling code as amd_get_highest_perf(). + +Reviewed-by: Perry Yuan +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + arch/x86/kernel/acpi/cppc.c | 30 ++++++++++++++++++++++++++++++ + drivers/cpufreq/amd-pstate.c | 34 ++-------------------------------- + include/acpi/cppc_acpi.h | 5 +++++ + 3 files changed, 37 insertions(+), 32 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void) + mutex_unlock(&freq_invariance_lock); + } + ++/* ++ * Get the highest performance register value. ++ * @cpu: CPU from which to get highest performance. ++ * @highest_perf: Return address for highest performance value. ++ * ++ * Return: 0 for success, negative error code otherwise. ++ */ ++int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) ++{ ++ u64 val; ++ int ret; ++ ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { ++ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); ++ if (ret) ++ goto out; ++ ++ val = AMD_CPPC_HIGHEST_PERF(val); ++ } else { ++ ret = cppc_get_highest_perf(cpu, &val); ++ if (ret) ++ goto out; ++ } ++ ++ WRITE_ONCE(*highest_perf, (u32)val); ++out: ++ return ret; ++} ++EXPORT_SYMBOL_GPL(amd_get_highest_perf); ++ + /** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -837,36 +837,6 @@ static void amd_pstste_sched_prefcore_wo + } + static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); + +-/* +- * Get the highest performance register value. +- * @cpu: CPU from which to get highest performance. +- * @highest_perf: Return address. +- * +- * Return: 0 for success, -EIO otherwise. +- */ +-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) +-{ +- int ret; +- +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- u64 cap1; +- +- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); +- if (ret) +- return ret; +- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); +- } else { +- u64 cppc_highest_perf; +- +- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); +- if (ret) +- return ret; +- WRITE_ONCE(*highest_perf, cppc_highest_perf); +- } +- +- return (ret); +-} +- + #define CPPC_MAX_PERF U8_MAX + + static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) +@@ -874,7 +844,7 @@ static void amd_pstate_init_prefcore(str + int ret, prio; + u32 highest_perf; + +- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); ++ ret = amd_get_highest_perf(cpudata->cpu, &highest_perf); + if (ret) + return; + +@@ -918,7 +888,7 @@ static void amd_pstate_update_limits(uns + if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) + goto free_cpufreq_put; + +- ret = amd_pstate_get_highest_perf(cpu, &cur_high); ++ ret = amd_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; + +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum, + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); + extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); + extern int cppc_set_auto_sel(int cpu, bool enable); ++extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); + extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) +@@ -235,6 +236,10 @@ static inline int cppc_get_auto_sel_caps + { + return -EOPNOTSUPP; + } ++static inline int 
amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) ++{ ++ return -ENODEV; ++} + static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { + return -EOPNOTSUPP; diff --git a/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch b/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch new file mode 100644 index 0000000..60ddc56 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch @@ -0,0 +1,251 @@ +From 3ab7da5bbf2087982dbfe2b0f2937d0dddc3afb1 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:03 -0500 +Subject: x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator() + +AMD systems that support preferred cores will use "166" as their +numerator for max frequency calculations instead of "255". + +Add a function for detecting preferred cores by looking at the +highest perf value on all cores. + +If preferred cores are enabled return 166 and if disabled the +value in the highest perf register. As the function will be called +multiple times, cache the values for the boost numerator and if +preferred cores will be enabled in global variables. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +--- + arch/x86/kernel/acpi/cppc.c | 93 ++++++++++++++++++++++++++++++++---- + drivers/cpufreq/amd-pstate.c | 34 +++++-------- + include/acpi/cppc_acpi.h | 5 ++ + 3 files changed, 101 insertions(+), 31 deletions(-) + +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -9,6 +9,16 @@ + #include + #include + ++#define CPPC_HIGHEST_PERF_PREFCORE 166 ++ ++enum amd_pref_core { ++ AMD_PREF_CORE_UNKNOWN = 0, ++ AMD_PREF_CORE_SUPPORTED, ++ AMD_PREF_CORE_UNSUPPORTED, ++}; ++static enum amd_pref_core amd_pref_core_detected; ++static u64 boost_numerator; ++ + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + + bool cpc_supported_by_cpu(void) +@@ -147,6 +157,66 @@ out: + EXPORT_SYMBOL_GPL(amd_get_highest_perf); + + /** ++ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores ++ * @detected: Output variable for the result of the detection. ++ * ++ * Determine whether CPUs in the system support preferred cores. On systems ++ * that support preferred cores, different highest perf values will be found ++ * on different cores. On other systems, the highest perf value will be the ++ * same on all cores. ++ * ++ * The result of the detection will be stored in the 'detected' parameter. ++ * ++ * Return: 0 for success, negative error code otherwise ++ */ ++int amd_detect_prefcore(bool *detected) ++{ ++ int cpu, count = 0; ++ u64 highest_perf[2] = {0}; ++ ++ if (WARN_ON(!detected)) ++ return -EINVAL; ++ ++ switch (amd_pref_core_detected) { ++ case AMD_PREF_CORE_SUPPORTED: ++ *detected = true; ++ return 0; ++ case AMD_PREF_CORE_UNSUPPORTED: ++ *detected = false; ++ return 0; ++ default: ++ break; ++ } ++ ++ for_each_present_cpu(cpu) { ++ u32 tmp; ++ int ret; ++ ++ ret = amd_get_highest_perf(cpu, &tmp); ++ if (ret) ++ return ret; ++ ++ if (!count || (count == 1 && tmp != highest_perf[0])) ++ highest_perf[count++] = tmp; ++ ++ if (count == 2) ++ break; ++ } ++ ++ *detected = (count == 2); ++ boost_numerator = highest_perf[0]; ++ ++ amd_pref_core_detected = *detected ? 
AMD_PREF_CORE_SUPPORTED : ++ AMD_PREF_CORE_UNSUPPORTED; ++ ++ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", ++ *detected ? "" : "un", highest_perf[0]); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(amd_detect_prefcore); ++ ++/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. +@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf); + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * ++ * If booting the system with amd-pstate enabled but preferred cores disabled then ++ * the correct boost numerator will be returned to match hardware capabilities ++ * even if the preferred cores scheduling hints are not enabled. ++ * + * Return: 0 for success, negative error code otherwise. + */ + int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { +- struct cpuinfo_x86 *c = &boot_cpu_data; +- +- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || +- (c->x86_model >= 0x70 && c->x86_model < 0x80))) { +- *numerator = 166; +- return 0; +- } ++ bool prefcore; ++ int ret; + +- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || +- (c->x86_model >= 0x40 && c->x86_model < 0x70))) { +- *numerator = 166; ++ ret = amd_detect_prefcore(&prefcore); ++ if (ret) ++ return ret; ++ ++ /* without preferred cores, return the highest perf register value */ ++ if (!prefcore) { ++ *numerator = boost_numerator; + return 0; + } +- *numerator = 255; ++ *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; + } +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -841,32 +841,18 @@ static DECLARE_WORK(sched_prefcore_work, + + static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) + { +- int ret, prio; +- u32 highest_perf; +- +- ret = amd_get_highest_perf(cpudata->cpu, &highest_perf); +- if (ret) ++ /* user disabled or not detected */ ++ if (!amd_pstate_prefcore) + return; + + cpudata->hw_prefcore = true; +- /* check if CPPC preferred core feature is enabled*/ +- if (highest_perf < CPPC_MAX_PERF) +- prio = (int)highest_perf; +- else { +- pr_debug("AMD CPPC preferred core is unsupported!\n"); +- cpudata->hw_prefcore = false; +- return; +- } +- +- if (!amd_pstate_prefcore) +- return; + + /* + * The priorities can be set regardless of whether or not + * sched_set_itmt_support(true) has been called and it is valid to + * update them at any time after it has been called. 
+ */ +- sched_set_itmt_core_prio(prio, cpudata->cpu); ++ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu); + + schedule_work(&sched_prefcore_work); + } +@@ -1037,12 +1023,12 @@ static int amd_pstate_cpu_init(struct cp + + cpudata->cpu = policy->cpu; + +- amd_pstate_init_prefcore(cpudata); +- + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + ++ amd_pstate_init_prefcore(cpudata); ++ + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; +@@ -1493,12 +1479,12 @@ static int amd_pstate_epp_cpu_init(struc + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + +- amd_pstate_init_prefcore(cpudata); +- + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + ++ amd_pstate_init_prefcore(cpudata); ++ + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; +@@ -1960,6 +1946,12 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_update_perf, cppc_update_perf); + } + ++ if (amd_pstate_prefcore) { ++ ret = amd_detect_prefcore(&amd_pstate_prefcore); ++ if (ret) ++ return ret; ++ } ++ + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -163,6 +163,7 @@ extern int cppc_get_auto_sel_caps(int cp + extern int cppc_set_auto_sel(int cpu, bool enable); + extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); + extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); ++extern int amd_detect_prefcore(bool *detected); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -244,6 +245,10 @@ static inline int amd_get_boost_ratio_nu + { + return -EOPNOTSUPP; + } ++static inline int amd_detect_prefcore(bool *detected) ++{ ++ return -ENODEV; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ diff --git a/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch new file mode 100644 index 0000000..e65aaff --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch @@ -0,0 +1,169 @@ +From 68d89574b86625f4bd7a784fe9bcc221dc290e4f Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:04 -0500 +Subject: cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into + amd_get_boost_ratio_numerator() + +The special case in amd_pstate_highest_perf_set() is the value used +for calculating the boost numerator. Merge this into +amd_get_boost_ratio_numerator() and then use that to calculate boost +ratio. + +This allows dropping more special casing of the highest perf value. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 3 +- + arch/x86/kernel/acpi/cppc.c | 16 +++++++ + drivers/cpufreq/amd-pstate.c | 52 ++++----------------- + 3 files changed, 28 insertions(+), 43 deletions(-) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Perfo + In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` + table, so we need to expose it to sysfs. 
If boost is not active, but + still supported, this maximum frequency will be larger than the one in +-``cpuinfo``. ++``cpuinfo``. On systems that support preferred core, the driver will have ++different values for some cores than others. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -9,6 +9,7 @@ + #include + #include + ++#define CPPC_HIGHEST_PERF_PERFORMANCE 196 + #define CPPC_HIGHEST_PERF_PREFCORE 166 + + enum amd_pref_core { +@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsign + *numerator = boost_numerator; + return 0; + } ++ ++ /* ++ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, ++ * the highest performance level is set to 196. ++ * https://bugzilla.kernel.org/show_bug.cgi?id=218759 ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { ++ switch (boot_cpu_data.x86_model) { ++ case 0x70 ... 0x7f: ++ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; ++ return 0; ++ default: ++ break; ++ } ++ } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -52,8 +52,6 @@ + #define AMD_PSTATE_TRANSITION_LATENCY 20000 + #define AMD_PSTATE_TRANSITION_DELAY 1000 + #define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600 +-#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +-#define CPPC_HIGHEST_PERF_DEFAULT 166 + + #define AMD_CPPC_EPP_PERFORMANCE 0x00 + #define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +@@ -398,43 +396,17 @@ static inline int amd_pstate_enable(bool + return static_call(amd_pstate_enable)(enable); + } + +-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata) +-{ +- struct cpuinfo_x86 *c = &cpu_data(0); +- +- /* +- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, +- * the highest performance level is set to 196. +- * https://bugzilla.kernel.org/show_bug.cgi?id=218759 +- */ +- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f)) +- return CPPC_HIGHEST_PERF_PERFORMANCE; +- +- return CPPC_HIGHEST_PERF_DEFAULT; +-} +- + static int pstate_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; +- u32 highest_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- /* For platforms that do not support the preferred core feature, the +- * highest_pef may be configured with 166 or 255, to avoid max frequency +- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as +- * the default max perf. 
+- */ +- if (cpudata->hw_prefcore) +- highest_perf = amd_pstate_highest_perf_set(cpudata); +- else +- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); ++ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); +@@ -446,19 +418,13 @@ static int pstate_init_perf(struct amd_c + static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; +- u32 highest_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- if (cpudata->hw_prefcore) +- highest_perf = amd_pstate_highest_perf_set(cpudata); +- else +- highest_perf = cppc_perf.highest_perf; +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, highest_perf); ++ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); ++ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); +@@ -944,8 +910,8 @@ static u32 amd_pstate_get_transition_lat + static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + { + int ret; +- u32 min_freq; +- u32 highest_perf, max_freq; ++ u32 min_freq, max_freq; ++ u64 numerator; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; +@@ -967,8 +933,10 @@ static int amd_pstate_init_freq(struct a + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + +- highest_perf = READ_ONCE(cpudata->highest_perf); +- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); diff --git a/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch new file mode 100644 index 0000000..3ab7c18 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch @@ -0,0 +1,42 @@ +From deed718125e73b6bf280dcebb80c39108226388c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:05 -0500 +Subject: cpufreq: amd-pstate: Optimize amd_pstate_update_limits() + +Don't take and release the mutex when prefcore isn't present and +avoid initialization of variables that will be initially set +in the function. + +Reviewed-by: Gautham R. Shenoy +Reviewed-by: Perry Yuan +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. 
Shenoy +--- + drivers/cpufreq/amd-pstate.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -836,17 +836,17 @@ static void amd_pstate_update_limits(uns + + cpudata = policy->driver_data; + +- mutex_lock(&amd_pstate_driver_lock); +- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) +- goto free_cpufreq_put; ++ if (!amd_pstate_prefcore) ++ return; + ++ mutex_lock(&amd_pstate_driver_lock); + ret = amd_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; + + prev_high = READ_ONCE(cpudata->prefcore_ranking); +- if (prev_high != cur_high) { +- highest_perf_changed = true; ++ highest_perf_changed = (prev_high != cur_high); ++ if (highest_perf_changed) { + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + + if (cur_high < CPPC_MAX_PERF) diff --git a/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch new file mode 100644 index 0000000..34b04aa --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch @@ -0,0 +1,29 @@ +From 391075a34e392c7cacd338a6b034a21a10679855 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:06 -0500 +Subject: cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore` + +Explain that the sysfs file represents both preferred core being +enabled by the user and supported by the hardware. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CP + `_.) + This attribute is read-only. + ++``amd_pstate_hw_prefcore`` ++ ++Whether the platform supports the preferred core feature and it has been ++enabled. This attribute is read-only. ++ + ``energy_performance_available_preferences`` + + A list of all the supported EPP preferences that could be used for diff --git a/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch b/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch new file mode 100644 index 0000000..09b194b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch @@ -0,0 +1,42 @@ +From 2ed9874f6dcafcc2bee7a922af9e1d1c62dbeb18 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:30:07 -0500 +Subject: amd-pstate: Add missing documentation for + `amd_pstate_prefcore_ranking` + +`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU +core based on platform conditions. Explicitly include it in the +documentation. + +Reviewed-by: Gautham R. Shenoy +Signed-off-by: Mario Limonciello +Reviewed-by: Gautham R. Shenoy +--- + Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performa + table, so we need to expose it to sysfs. 
If boost is not active, but + still supported, this maximum frequency will be larger than the one in + ``cpuinfo``. On systems that support preferred core, the driver will have +-different values for some cores than others. ++different values for some cores than others and this will reflect the values ++advertised by the platform at bootup. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +@@ -268,6 +269,12 @@ This attribute is read-only. + Whether the platform supports the preferred core feature and it has been + enabled. This attribute is read-only. + ++``amd_pstate_prefcore_ranking`` ++ ++The performance ranking of the core. This number doesn't have any unit, but ++larger numbers are preferred at the time of reading. This can change at ++runtime based on platform conditions. This attribute is read-only. ++ + ``energy_performance_available_preferences`` + + A list of all the supported EPP preferences that could be used for diff --git a/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch new file mode 100644 index 0000000..de4a858 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch @@ -0,0 +1,24 @@ +From 2e2ba39aec71fb51e897c3275b255ef806800cf0 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 5 Sep 2024 11:23:51 -0500 +Subject: cpufreq/amd-pstate: Fix non kerneldoc comment + +The comment for amd_cppc_supported() isn't meant to be kernel doc. + +Fixes: cb817ec6673b7 ("cpufreq: amd-pstate: show CPPC debug message if CPPC is not supported") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1786,7 +1786,7 @@ static int __init amd_pstate_set_driver( + return -EINVAL; + } + +-/** ++/* + * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. + */ diff --git a/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch new file mode 100644 index 0000000..e770805 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch @@ -0,0 +1,24 @@ +From 185e64a7e1a749593f3d6dadc666da9dda82d48c Mon Sep 17 00:00:00 2001 +From: Qianqiang Liu +Date: Wed, 11 Sep 2024 07:39:24 +0800 +Subject: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue + +Using uninitialized value "mode2" when calling "amd_pstate_get_mode_string". +Set "mode2" to "AMD_PSTATE_DISABLE" by default. 
+ +Signed-off-by: Qianqiang Liu +--- + drivers/cpufreq/amd-pstate-ut.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -270,7 +270,7 @@ static int amd_pstate_set_mode(enum amd_ + + static void amd_pstate_ut_check_driver(u32 index) + { +- enum amd_pstate_mode mode1, mode2; ++ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE; + int ret; + + for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { diff --git a/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch new file mode 100644 index 0000000..efe032b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch @@ -0,0 +1,108 @@ +From d74ce254cc470da670d6b90c69bab553cdbde62b Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Tue, 17 Sep 2024 09:14:35 +0000 +Subject: cpufreq/amd-pstate: Rename MSR and shared memory specific functions + +Existing function names "cppc_*" and "pstate_*" for shared memory and +MSR based systems are not intuitive enough, replace them with "shmem_*" and +"msr_*" respectively. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -263,7 +263,7 @@ static int amd_pstate_get_energy_pref_in + return index; + } + +-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) + { + if (fast_switch) +@@ -273,7 +273,7 @@ static void pstate_update_perf(struct am + READ_ONCE(cpudata->cppc_req_cached)); + } + +-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); ++DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + + static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +@@ -336,7 +336,7 @@ static int amd_pstate_set_energy_pref_in + return ret; + } + +-static inline int pstate_enable(bool enable) ++static inline int msr_enable(bool enable) + { + int ret, cpu; + unsigned long logical_proc_id_mask = 0; +@@ -362,7 +362,7 @@ static inline int pstate_enable(bool ena + return 0; + } + +-static int cppc_enable(bool enable) ++static int shmem_enable(bool enable) + { + int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; +@@ -389,14 +389,14 @@ static int cppc_enable(bool enable) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); ++DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable); + + static inline int amd_pstate_enable(bool enable) + { + return static_call(amd_pstate_enable)(enable); + } + +-static int pstate_init_perf(struct amd_cpudata *cpudata) ++static int msr_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; + +@@ -415,7 +415,7 @@ static int pstate_init_perf(struct amd_c + return 0; + } + +-static int cppc_init_perf(struct amd_cpudata *cpudata) ++static int shmem_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; + +@@ -450,14 +450,14 @@ static int cppc_init_perf(struct amd_cpu + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); ++DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf); + + static inline int amd_pstate_init_perf(struct 
amd_cpudata *cpudata) + { + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static void cppc_update_perf(struct amd_cpudata *cpudata, ++static void shmem_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) + { +@@ -1909,9 +1909,9 @@ static int __init amd_pstate_init(void) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); +- static_call_update(amd_pstate_enable, cppc_enable); +- static_call_update(amd_pstate_init_perf, cppc_init_perf); +- static_call_update(amd_pstate_update_perf, cppc_update_perf); ++ static_call_update(amd_pstate_enable, shmem_enable); ++ static_call_update(amd_pstate_init_perf, shmem_init_perf); ++ static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + + if (amd_pstate_prefcore) { diff --git a/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch new file mode 100644 index 0000000..bd41ffe --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch @@ -0,0 +1,115 @@ +From 787175146e26a199c06be4e6bf8cf8da0f757271 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:52 +0000 +Subject: cpufreq: Add a callback to update the min_freq_req from drivers + +Currently, there is no proper way to update the initial lower frequency +limit from cpufreq drivers. Only way is to add a new min_freq qos +request from the driver side, but it leads to the issue explained below. + +The QoS infrastructure collates the constraints from multiple +subsystems and saves them in a plist. The "current value" is defined to +be the highest value in the plist for min_freq constraint. + +The cpufreq core adds a qos_request for min_freq to be 0 and the amd-pstate +driver today adds qos request for min_freq to be lowest_freq, where +lowest_freq corresponds to CPPC.lowest_perf. + +Eg: Suppose WLOG considering amd-pstate driver, lowest_freq is 400000 KHz, +lowest_non_linear_freq is 1200000 KHz. + +At this point of time, the min_freq QoS plist looks like: + +head--> 400000 KHz (registered by amd-pstate) --> 0 KHz (registered by +cpufreq core) + +When a user updates /sys/devices/system/cpu/cpuX/cpufreq/scaling_min_freq, +it only results in updating the cpufreq-core's node in the plist, where +say 0 becomes the newly echoed value. + +Now, if the user echoes a value 1000000 KHz, to scaling_min_freq, then the +new list would be + +head--> 1000000 KHz (registered by cpufreq core) --> 400000 KHz (registered +by amd-pstate) + +and the new "current value" of the min_freq QoS constraint will be 1000000 +KHz, this is the scenario where it works as expected. + +Suppose we change the amd-pstate driver code's min_freq qos constraint +to lowest_non_linear_freq instead of lowest_freq, then the user will +never be able to request a value below that, due to the following: + +At boot time, the min_freq QoS plist would be + +head--> 1200000 KHz (registered by amd-pstate) --> 0 KHz (registered by +cpufreq core) + +When the user echoes a value of 1000000 KHz, to +/sys/devices/..../scaling_min_freq, then the new list would be + +head--> 1200000 KHz (registered by amd-pstate) --> 1000000 KHz (registered +by cpufreq core) + +with the new "current value" of the min_freq QoS remaining 1200000 KHz. 
+Since the current value has not changed, there won't be any notifications +sent to the subsystems which have added their QoS constraints. In +particular, the amd-pstate driver will not get the notification, and thus, +the user's request to lower the scaling_min_freq will be ineffective. + +Hence, it is advisable to have a single source of truth for the min and +max freq QoS constraints between the cpufreq and the cpufreq drivers. + +So add a new callback get_init_min_freq() add in struct cpufreq_driver, +which allows amd-pstate (or any other cpufreq driver) to override the +default min_freq value being set in the policy->min_freq_req. Now +scaling_min_freq can be modified by the user to any value (lower or +higher than the init value) later on if desired. + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/cpufreq.c | 6 +++++- + include/linux/cpufreq.h | 6 ++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -1380,6 +1380,7 @@ static int cpufreq_online(unsigned int c + bool new_policy; + unsigned long flags; + unsigned int j; ++ u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE; + int ret; + + pr_debug("%s: bringing CPU%u online\n", __func__, cpu); +@@ -1464,9 +1465,12 @@ static int cpufreq_online(unsigned int c + goto out_destroy_policy; + } + ++ if (cpufreq_driver->get_init_min_freq) ++ init_min_freq = cpufreq_driver->get_init_min_freq(policy); ++ + ret = freq_qos_add_request(&policy->constraints, + policy->min_freq_req, FREQ_QOS_MIN, +- FREQ_QOS_MIN_DEFAULT_VALUE); ++ init_min_freq); + if (ret < 0) { + /* + * So we don't call freq_qos_remove_request() for an +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -414,6 +414,12 @@ struct cpufreq_driver { + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); ++ ++ /* ++ * Set by drivers that want to initialize the policy->min_freq_req with ++ * a value different from the default value (0) in cpufreq core. ++ */ ++ int (*get_init_min_freq)(struct cpufreq_policy *policy); + }; + + /* flags */ diff --git a/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch new file mode 100644 index 0000000..fa421e8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch @@ -0,0 +1,79 @@ +From f5b234be445a45b0bcacc37e0aad7a6bc7900eac Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:54 +0000 +Subject: cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq + +According to the AMD architectural programmer's manual volume 2 [1], in +section "17.6.4.1 CPPC_CAPABILITY_1" lowest_nonlinear_perf is described +as "Reports the most energy efficient performance level (in terms of +performance per watt). Above this threshold, lower performance levels +generally result in increased energy efficiency. Reducing performance +below this threshold does not result in total energy savings for a given +computation, although it reduces instantaneous power consumption". So +lowest_nonlinear_perf is the most power efficient performance level, and +going below that would lead to a worse performance/watt. 
+ +Also, setting the minimum frequency to lowest_nonlinear_freq (instead of +lowest_freq) allows the CPU to idle at a higher frequency which leads +to more time being spent in a deeper idle state (as trivial idle tasks +are completed sooner). This has shown a power benefit in some systems, +in other systems, power consumption has increased but so has the +throughput/watt. + +Use the get_init_min_freq() callback to set the initial lower limit for +amd-pstate driver to lowest_nonlinear_freq instead of lowest_freq. + +Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf [1] + +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1025,13 +1025,6 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], +- FREQ_QOS_MIN, policy->cpuinfo.min_freq); +- if (ret < 0) { +- dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); +- goto free_cpudata1; +- } +- + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { +@@ -1736,6 +1729,13 @@ static int amd_pstate_epp_resume(struct + return 0; + } + ++static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ return READ_ONCE(cpudata->lowest_nonlinear_freq); ++} ++ + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -1749,6 +1749,7 @@ static struct cpufreq_driver amd_pstate_ + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", + .attr = amd_pstate_attr, ++ .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static struct cpufreq_driver amd_pstate_epp_driver = { +@@ -1765,6 +1766,7 @@ static struct cpufreq_driver amd_pstate_ + .set_boost = amd_pstate_set_boost, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, ++ .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static int __init amd_pstate_set_driver(int mode_idx) diff --git a/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch new file mode 100644 index 0000000..1b82a70 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch @@ -0,0 +1,103 @@ +From f7b2b3a1c0d015c4272793bed89734c5cffb354c Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 3 Oct 2024 08:39:56 +0000 +Subject: cpufreq/amd-pstate: Cleanup the old min_freq qos request remnants + +Convert the freq_qos_request array in struct amd_cpudata to a single +variable (only for max_freq request). Remove the references to cpudata->req +array. Remove and rename the jump labels accordingly. 
+ +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 19 ++++++++----------- + drivers/cpufreq/amd-pstate.h | 4 ++-- + 2 files changed, 10 insertions(+), 13 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -726,7 +726,7 @@ static int amd_pstate_cpu_boost_update(s + policy->max = policy->cpuinfo.max_freq; + + if (cppc_state == AMD_PSTATE_PASSIVE) { +- ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); ++ ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq); + if (ret < 0) + pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu); + } +@@ -993,17 +993,17 @@ static int amd_pstate_cpu_init(struct cp + + ret = amd_pstate_init_perf(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + amd_pstate_init_prefcore(cpudata); + + ret = amd_pstate_init_freq(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + ret = amd_pstate_init_boost_support(cpudata); + if (ret) +- goto free_cpudata1; ++ goto free_cpudata; + + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +@@ -1025,11 +1025,11 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req, + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); +- goto free_cpudata2; ++ goto free_cpudata; + } + + cpudata->max_limit_freq = max_freq; +@@ -1042,9 +1042,7 @@ static int amd_pstate_cpu_init(struct cp + + return 0; + +-free_cpudata2: +- freq_qos_remove_request(&cpudata->req[0]); +-free_cpudata1: ++free_cpudata: + kfree(cpudata); + return ret; + } +@@ -1053,8 +1051,7 @@ static void amd_pstate_cpu_exit(struct c + { + struct amd_cpudata *cpudata = policy->driver_data; + +- freq_qos_remove_request(&cpudata->req[1]); +- freq_qos_remove_request(&cpudata->req[0]); ++ freq_qos_remove_request(&cpudata->max_freq_req); + policy->fast_switch_possible = false; + kfree(cpudata); + } +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -28,7 +28,7 @@ struct amd_aperf_mperf { + /** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number +- * @req: constraint request to apply ++ * @max_freq_req: maximum frequency constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions +@@ -68,7 +68,7 @@ struct amd_aperf_mperf { + struct amd_cpudata { + int cpu; + +- struct freq_qos_request req[2]; ++ struct freq_qos_request max_freq_req; + u64 cppc_req_cached; + + u32 highest_perf; diff --git a/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch new file mode 100644 index 0000000..6b342ff --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch @@ -0,0 +1,42 @@ +From d1216c052bedbf6d79e4b0261e2f09e17c66ffd3 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 4 Oct 2024 12:23:04 +0000 +Subject: cpufreq/amd-pstate: Fix amd_pstate mode switch on shared memory + systems + +While switching the 
driver mode between active and passive, Collaborative +Processor Performance Control (CPPC) is disabled in +amd_pstate_unregister_driver(). But, it is not enabled back while registering +the new driver (passive or active). This leads to the new driver mode not +working correctly, so enable it back in amd_pstate_register_driver(). + +Fixes: 3ca7bc818d8c ("cpufreq: amd-pstate: Add guided mode control support via sysfs") +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1221,11 +1221,21 @@ static int amd_pstate_register_driver(in + return -EINVAL; + + cppc_state = mode; ++ ++ ret = amd_pstate_enable(true); ++ if (ret) { ++ pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", ++ ret); ++ amd_pstate_driver_cleanup(); ++ return ret; ++ } ++ + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); + return ret; + } ++ + return 0; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch new file mode 100644 index 0000000..7036362 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch @@ -0,0 +1,57 @@ +From c4fde0d177bdb33912f450914d84d6432391a8b5 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:16 -0500 +Subject: cpufreq/amd-pstate: Use nominal perf for limits when boost is + disabled + +When boost has been disabled the limit for perf should be nominal perf not +the highest perf. Using the latter to do calculations will lead to +incorrect values that are still above nominal. 
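+As a rough illustration with made-up numbers (not tied to any particular CPU):
+assume highest_perf = 228, nominal_perf = 166, and policy->max equal to
+policy->cpuinfo.max_freq. With boost disabled the new code picks
+max_perf = nominal_perf, so
+
+  max_limit_perf = div_u64(policy->max * 166, policy->cpuinfo.max_freq) = 166
+
+whereas scaling against highest_perf would yield 228, i.e. a limit above
+nominal even though boost is off.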
+ +Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") +Reported-by: Peter Jung +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219348 +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 20 ++++++++++++++------ + 1 file changed, 14 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -566,11 +566,16 @@ static int amd_pstate_verify(struct cpuf + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf; ++ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; + struct amd_cpudata *cpudata = policy->driver_data; + +- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); +- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); ++ if (cpudata->boost_supported && !policy->boost_enabled) ++ max_perf = READ_ONCE(cpudata->nominal_perf); ++ else ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ ++ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + + lowest_perf = READ_ONCE(cpudata->lowest_perf); + if (min_limit_perf < lowest_perf) +@@ -1526,10 +1531,13 @@ static int amd_pstate_epp_update_limit(s + u64 value; + s16 epp; + +- max_perf = READ_ONCE(cpudata->highest_perf); ++ if (cpudata->boost_supported && !policy->boost_enabled) ++ max_perf = READ_ONCE(cpudata->nominal_perf); ++ else ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); +- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); ++ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + + if (min_limit_perf < min_perf) + min_limit_perf = min_perf; diff --git a/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch new file mode 100644 index 0000000..0019cc1 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch @@ -0,0 +1,55 @@ +From 01ad0fb3da95867947d923596a26b18d844afe3c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:17 -0500 +Subject: cpufreq/amd-pstate: Don't update CPPC request in + amd_pstate_cpu_boost_update() + +When boost is changed the CPPC value is changed in amd_pstate_cpu_boost_update() +but then changed again when refresh_frequency_limits() and all it's callbacks +occur. The first is a pointless write, so instead just update the limits for +the policy and let the policy refresh anchor everything properly. 
+ +Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 24 +----------------------- + 1 file changed, 1 insertion(+), 23 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -695,34 +695,12 @@ static void amd_pstate_adjust_perf(unsig + static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) + { + struct amd_cpudata *cpudata = policy->driver_data; +- struct cppc_perf_ctrls perf_ctrls; +- u32 highest_perf, nominal_perf, nominal_freq, max_freq; ++ u32 nominal_freq, max_freq; + int ret = 0; + +- highest_perf = READ_ONCE(cpudata->highest_perf); +- nominal_perf = READ_ONCE(cpudata->nominal_perf); + nominal_freq = READ_ONCE(cpudata->nominal_freq); + max_freq = READ_ONCE(cpudata->max_freq); + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { +- u64 value = READ_ONCE(cpudata->cppc_req_cached); +- +- value &= ~GENMASK_ULL(7, 0); +- value |= on ? highest_perf : nominal_perf; +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- } else { +- perf_ctrls.max_perf = on ? highest_perf : nominal_perf; +- ret = cppc_set_perf(cpudata->cpu, &perf_ctrls); +- if (ret) { +- cpufreq_cpu_release(policy); +- pr_debug("Failed to set max perf on CPU:%d. ret:%d\n", +- cpudata->cpu, ret); +- return ret; +- } +- } +- + if (on) + policy->cpuinfo.max_freq = max_freq; + else if (policy->cpuinfo.max_freq > nominal_freq * 1000) diff --git a/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch new file mode 100644 index 0000000..3fd8c77 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch @@ -0,0 +1,49 @@ +From 684d162c08ab86fff02861c907ecc92bf9c09af4 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:18 -0500 +Subject: cpufreq/amd-pstate: Use amd_pstate_update_min_max_limit() for EPP + limits + +When the EPP updates are set the maximum capable frequency for the +CPU is used to set the upper limit instead of that of the policy. + +Adjust amd_pstate_epp_update_limit() to reuse policy calculation code +from amd_pstate_update_min_max_limit(). 
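+Illustrative only, a condensed view of what the EPP path does after this
+change (taken from the hunk below, not a complete function):
+
+  amd_pstate_update_min_max_limit(policy);
+  max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
+                     cpudata->max_limit_perf);
+
+so the min/max limit bookkeeping lives in one helper for both the passive and
+the EPP paths.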
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 19 +++---------------- + 1 file changed, 3 insertions(+), 16 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1505,26 +1505,13 @@ static void amd_pstate_epp_cpu_exit(stru + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf, min_limit_perf, max_limit_perf; ++ u32 max_perf, min_perf; + u64 value; + s16 epp; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); +- +- if (min_limit_perf < min_perf) +- min_limit_perf = min_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; +- +- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); +- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); ++ amd_pstate_update_min_max_limit(policy); + + max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, + cpudata->max_limit_perf); diff --git a/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch new file mode 100644 index 0000000..33964dc --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch @@ -0,0 +1,29 @@ +From fa46d2873c9fa4060ce407e4bc5c7e29babce9d0 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Sat, 12 Oct 2024 12:45:19 -0500 +Subject: cpufreq/amd-pstate: Drop needless EPP initialization + +The EPP value doesn't need to be cached to the CPPC request in +amd_pstate_epp_update_limit() because it's passed as an argument +at the end to amd_pstate_set_epp() and stored at that time. 
+ +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1548,12 +1548,6 @@ static int amd_pstate_epp_update_limit(s + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + +- /* Set initial EPP value */ +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- } +- + WRITE_ONCE(cpudata->cppc_req_cached, value); + return amd_pstate_set_epp(cpudata, epp); + } diff --git a/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch b/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch new file mode 100644 index 0000000..9fe70c7 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch @@ -0,0 +1,228 @@ +From 649d296be0c7f0df6e71b4fca25fdbe75cb3994e Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Thu, 17 Oct 2024 17:03:11 +0200 +Subject: amd-pstate-6.11: update setting the minimum frequency to + lowest_nonlinear_freq patchset to v3 + +Signed-off-by: Oleksandr Natalenko +--- + drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++--------------- + drivers/cpufreq/amd-pstate.h | 4 +-- + drivers/cpufreq/cpufreq.c | 6 +--- + include/linux/cpufreq.h | 6 ---- + 4 files changed, 43 insertions(+), 40 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -557,9 +557,28 @@ cpufreq_policy_put: + cpufreq_cpu_put(policy); + } + +-static int amd_pstate_verify(struct cpufreq_policy_data *policy) ++static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { +- cpufreq_verify_within_cpu_limits(policy); ++ /* ++ * Initialize lower frequency limit (i.e.policy->min) with ++ * lowest_nonlinear_frequency which is the most energy efficient ++ * frequency. Override the initial value set by cpufreq core and ++ * amd-pstate qos_requests. 
++ */ ++ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { ++ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); ++ struct amd_cpudata *cpudata; ++ ++ if (!policy) ++ return -EINVAL; ++ ++ cpudata = policy->driver_data; ++ policy_data->min = cpudata->lowest_nonlinear_freq; ++ cpufreq_cpu_put(policy); ++ } ++ ++ cpufreq_verify_within_cpu_limits(policy_data); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); + + return 0; + } +@@ -709,7 +728,7 @@ static int amd_pstate_cpu_boost_update(s + policy->max = policy->cpuinfo.max_freq; + + if (cppc_state == AMD_PSTATE_PASSIVE) { +- ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq); ++ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); + if (ret < 0) + pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu); + } +@@ -976,17 +995,17 @@ static int amd_pstate_cpu_init(struct cp + + ret = amd_pstate_init_perf(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + amd_pstate_init_prefcore(cpudata); + + ret = amd_pstate_init_freq(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + ret = amd_pstate_init_boost_support(cpudata); + if (ret) +- goto free_cpudata; ++ goto free_cpudata1; + + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +@@ -1008,11 +1027,18 @@ static int amd_pstate_cpu_init(struct cp + if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + +- ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req, ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], ++ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); ++ if (ret < 0) { ++ dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); ++ goto free_cpudata1; ++ } ++ ++ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); +- goto free_cpudata; ++ goto free_cpudata2; + } + + cpudata->max_limit_freq = max_freq; +@@ -1025,7 +1051,9 @@ static int amd_pstate_cpu_init(struct cp + + return 0; + +-free_cpudata: ++free_cpudata2: ++ freq_qos_remove_request(&cpudata->req[0]); ++free_cpudata1: + kfree(cpudata); + return ret; + } +@@ -1034,7 +1062,8 @@ static void amd_pstate_cpu_exit(struct c + { + struct amd_cpudata *cpudata = policy->driver_data; + +- freq_qos_remove_request(&cpudata->max_freq_req); ++ freq_qos_remove_request(&cpudata->req[1]); ++ freq_qos_remove_request(&cpudata->req[0]); + policy->fast_switch_possible = false; + kfree(cpudata); + } +@@ -1658,13 +1687,6 @@ static int amd_pstate_epp_cpu_offline(st + return 0; + } + +-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +-{ +- cpufreq_verify_within_cpu_limits(policy); +- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); +- return 0; +-} +- + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1703,13 +1725,6 @@ static int amd_pstate_epp_resume(struct + return 0; + } + +-static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy) +-{ +- struct amd_cpudata *cpudata = policy->driver_data; +- +- return READ_ONCE(cpudata->lowest_nonlinear_freq); +-} +- + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -1723,12 +1738,11 @@ static struct 
cpufreq_driver amd_pstate_ + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", + .attr = amd_pstate_attr, +- .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, +- .verify = amd_pstate_epp_verify_policy, ++ .verify = amd_pstate_verify, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +@@ -1740,7 +1754,6 @@ static struct cpufreq_driver amd_pstate_ + .set_boost = amd_pstate_set_boost, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, +- .get_init_min_freq = amd_pstate_get_init_min_freq, + }; + + static int __init amd_pstate_set_driver(int mode_idx) +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -28,7 +28,7 @@ struct amd_aperf_mperf { + /** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number +- * @max_freq_req: maximum frequency constraint request to apply ++ * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions +@@ -68,7 +68,7 @@ struct amd_aperf_mperf { + struct amd_cpudata { + int cpu; + +- struct freq_qos_request max_freq_req; ++ struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -1380,7 +1380,6 @@ static int cpufreq_online(unsigned int c + bool new_policy; + unsigned long flags; + unsigned int j; +- u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE; + int ret; + + pr_debug("%s: bringing CPU%u online\n", __func__, cpu); +@@ -1465,12 +1464,9 @@ static int cpufreq_online(unsigned int c + goto out_destroy_policy; + } + +- if (cpufreq_driver->get_init_min_freq) +- init_min_freq = cpufreq_driver->get_init_min_freq(policy); +- + ret = freq_qos_add_request(&policy->constraints, + policy->min_freq_req, FREQ_QOS_MIN, +- init_min_freq); ++ FREQ_QOS_MIN_DEFAULT_VALUE); + if (ret < 0) { + /* + * So we don't call freq_qos_remove_request() for an +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -414,12 +414,6 @@ struct cpufreq_driver { + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); +- +- /* +- * Set by drivers that want to initialize the policy->min_freq_req with +- * a value different from the default value (0) in cpufreq core. +- */ +- int (*get_init_min_freq)(struct cpufreq_policy *policy); + }; + + /* flags */ diff --git a/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch new file mode 100644 index 0000000..cc03ae5 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch @@ -0,0 +1,44 @@ +From db147a0a6341822a15fd9c4cd51f8dc4a9a1747b Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:27 +0000 +Subject: cpufreq/amd-pstate: Call amd_pstate_register() in amd_pstate_init() + +Replace a similar chunk of code in amd_pstate_init() with +amd_pstate_register() call. 
+ +Suggested-by: Mario Limonciello +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 12 ++---------- + 1 file changed, 2 insertions(+), 10 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1909,17 +1909,10 @@ static int __init amd_pstate_init(void) + return ret; + } + +- /* enable amd pstate feature */ +- ret = amd_pstate_enable(true); +- if (ret) { +- pr_err("failed to enable driver mode(%d)\n", cppc_state); +- return ret; +- } +- +- ret = cpufreq_register_driver(current_pstate_driver); ++ ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); +- goto disable_driver; ++ return ret; + } + + dev_root = bus_get_dev_root(&cpu_subsys); +@@ -1936,7 +1929,6 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +-disable_driver: + amd_pstate_enable(false); + return ret; + } diff --git a/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch new file mode 100644 index 0000000..39b68b6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch @@ -0,0 +1,81 @@ +From 7c658490b05f6ab4dd59e1c25e75ba1037f6cfeb Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:29 +0000 +Subject: cpufreq/amd-pstate: Call amd_pstate_set_driver() in + amd_pstate_register_driver() + +Replace a similar chunk of code in amd_pstate_register_driver() with +amd_pstate_set_driver() call. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 47 +++++++++++++++++------------------- + 1 file changed, 22 insertions(+), 25 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1221,16 +1221,32 @@ static void amd_pstate_driver_cleanup(vo + current_pstate_driver = NULL; + } + ++static int amd_pstate_set_driver(int mode_idx) ++{ ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ + static int amd_pstate_register_driver(int mode) + { + int ret; + +- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- else if (mode == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- else +- return -EINVAL; ++ ret = amd_pstate_set_driver(mode); ++ if (ret) ++ return ret; + + cppc_state = mode; + +@@ -1756,25 +1772,6 @@ static struct cpufreq_driver amd_pstate_ + .attr = amd_pstate_epp_attr, + }; + +-static int __init amd_pstate_set_driver(int mode_idx) +-{ +- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { +- cppc_state = mode_idx; +- if (cppc_state == AMD_PSTATE_DISABLE) +- pr_info("driver is explicitly disabled\n"); +- +- if (cppc_state == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- +- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- +- return 0; +- } +- +- return -EINVAL; +-} +- + /* 
+ * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. diff --git a/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch new file mode 100644 index 0000000..507e5e4 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch @@ -0,0 +1,41 @@ +From 55be5db97f4f52badc958463ee8d9cbc2ae91615 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:31 +0000 +Subject: cpufreq/amd-pstate: Remove the switch case in amd_pstate_init() + +Replace the switch case with a more readable if condition. + +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1873,21 +1873,15 @@ static int __init amd_pstate_init(void) + cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; + } + +- switch (cppc_state) { +- case AMD_PSTATE_DISABLE: ++ if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; +- case AMD_PSTATE_PASSIVE: +- case AMD_PSTATE_ACTIVE: +- case AMD_PSTATE_GUIDED: +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) +- return ret; +- break; +- default: +- return -EINVAL; + } + ++ ret = amd_pstate_set_driver(cppc_state); ++ if (ret) ++ return ret; ++ + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); diff --git a/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch b/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch new file mode 100644 index 0000000..d66b162 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch @@ -0,0 +1,43 @@ +From 7305364888151cb9e6b435c5f219ccfd18132b58 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Thu, 17 Oct 2024 10:05:33 +0000 +Subject: cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call + +amd_pstate_set_driver() is called twice, once in amd_pstate_init() and once +as part of amd_pstate_register_driver(). Move around code and eliminate +the redundancy. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + drivers/cpufreq/amd-pstate.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1878,9 +1878,11 @@ static int __init amd_pstate_init(void) + return -ENODEV; + } + +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) ++ ret = amd_pstate_register_driver(cppc_state); ++ if (ret) { ++ pr_err("failed to register with return %d\n", ret); + return ret; ++ } + + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +@@ -1900,12 +1902,6 @@ static int __init amd_pstate_init(void) + return ret; + } + +- ret = amd_pstate_register_driver(cppc_state); +- if (ret) { +- pr_err("failed to register with return %d\n", ret); +- return ret; +- } +- + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group); diff --git a/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch b/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch new file mode 100644 index 0000000..30b28d1 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch @@ -0,0 +1,33 @@ +From 5886ef269d069c72ea952cb00699e16221289e8c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Thu, 17 Oct 2024 12:34:39 -0500 +Subject: cpufreq/amd-pstate-ut: Add fix for min freq unit test + +commit 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to +lowest_nonlinear_freq") changed the iniital minimum frequency to lowest +nonlinear frequency, but the unit tests weren't updated and now fail. + +Update them to match this same change. + +Fixes: 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq") +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate-ut.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 + goto skip_test; + } + +- if (cpudata->min_freq != policy->min) { ++ if (cpudata->lowest_nonlinear_freq != policy->min) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; +- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", +- __func__, cpu, cpudata->min_freq, policy->min); ++ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", ++ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); + goto skip_test; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch b/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch new file mode 100644 index 0000000..528b7cb --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch @@ -0,0 +1,33 @@ +From e82b9b5a56bcac18cae68878fe67263279805735 Mon Sep 17 00:00:00 2001 +From: "Gautham R. Shenoy" +Date: Mon, 21 Oct 2024 15:48:35 +0530 +Subject: amd-pstate: Set min_perf to nominal_perf for active mode performance + gov + +The amd-pstate driver sets CPPC_REQ.min_perf to CPPC_REQ.max_perf when +in active mode with performance governor. Typically CPPC_REQ.max_perf +is set to CPPC.highest_perf. 
This causes frequency throttling on +power-limited platforms which causes performance regressions on +certain classes of workloads. + +Hence, set the CPPC_REQ.min_perf to the CPPC.nominal_perf or +CPPC_REQ.max_perf, whichever is lower of the two. + +Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors") +Signed-off-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1565,7 +1565,7 @@ static int amd_pstate_epp_update_limit(s + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- min_perf = max_perf; ++ min_perf = min(cpudata->nominal_perf, max_perf); + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); diff --git a/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch b/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch new file mode 100644 index 0000000..bfe6fd4 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch @@ -0,0 +1,44 @@ +From 497447cf96a785a4edd0756da5d5718037f5687c Mon Sep 17 00:00:00 2001 +From: Swapnil Sapkal +Date: Mon, 21 Oct 2024 15:48:36 +0530 +Subject: amd-pstate: Switch to amd-pstate by default on some Server platforms + +Currently the default cpufreq driver for all the AMD EPYC servers is +acpi-cpufreq. Going forward, switch to amd-pstate as the default +driver on the AMD EPYC server platforms with CPU family 0x1A or +higher. The default mode will be active mode. + +Testing shows that amd-pstate with active mode and performance +governor provides comparable or better performance per-watt against +acpi-cpufreq + performance governor. + +Likewise, amd-pstate with active mode and powersave governor with the +energy_performance_preference=power (EPP=255) provides comparable or +better performance per-watt against acpi-cpufreq + schedutil governor +for a wide range of workloads. + +Users can still revert to using acpi-cpufreq driver on these platforms +with the "amd_pstate=disable" kernel commandline parameter. + +Signed-off-by: Swapnil Sapkal +Signed-off-by: Gautham R. Shenoy +Reviewed-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1862,10 +1862,10 @@ static int __init amd_pstate_init(void) + if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms +- * 2. Server platforms ++ * 2. Server platforms with CPUs older than Family 0x1A. 
+ */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server()) { ++ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } diff --git a/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch b/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch new file mode 100644 index 0000000..f4f044b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch @@ -0,0 +1,38 @@ +From a4d255935a1ea6e4b10167df942ec641079bcdf7 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Mon, 28 Oct 2024 09:55:41 -0500 +Subject: cpufreq/amd-pstate: Push adjust_perf vfunc init into cpu_init + +As the driver can be changed in and out of different modes it's possible +that adjust_perf is assigned when it shouldn't be. + +This could happen if an MSR design is started up in passive mode and then +switches to active mode. + +To solve this explicitly clear `adjust_perf` in amd_pstate_epp_cpu_init(). + +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1528,6 +1528,8 @@ static int amd_pstate_epp_cpu_init(struc + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + ++ current_pstate_driver->adjust_perf = NULL; ++ + return 0; + + free_cpudata1: +@@ -1887,8 +1889,6 @@ static int __init amd_pstate_init(void) + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- if (cppc_state != AMD_PSTATE_ACTIVE) +- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); + static_call_update(amd_pstate_enable, shmem_enable); diff --git a/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch b/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch new file mode 100644 index 0000000..32b6282 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch @@ -0,0 +1,47 @@ +From c42a82a583646dcbba8500d47ed878616ab5c33a Mon Sep 17 00:00:00 2001 +From: Mario Limonciello +Date: Mon, 28 Oct 2024 09:55:42 -0500 +Subject: cpufreq/amd-pstate: Move registration after static function call + update + +On shared memory designs the static functions need to work before +registration is done or the system can hang at bootup. + +Move the registration later in amd_pstate_init() to solve this. 
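+A rough sketch of the resulting ordering in amd_pstate_init() (condensed from
+the hunks below, illustrative only):
+
+  /* 1. retarget the static calls for shared memory designs first */
+  static_call_update(amd_pstate_enable, shmem_enable);
+  static_call_update(amd_pstate_init_perf, shmem_init_perf);
+  static_call_update(amd_pstate_update_perf, shmem_update_perf);
+
+  /* 2. only then register the driver, which may already invoke them */
+  ret = amd_pstate_register_driver(cppc_state);
+
+so the shared memory helpers are in place before any CPU init path can reach
+them through the static calls.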
+ +Fixes: e238968a2087 ("cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call") +Reported-by: Klara Modin +Closes: https://lore.kernel.org/linux-pm/cf9c146d-bacf-444e-92e2-15ebf513af96@gmail.com/#t +Signed-off-by: Mario Limonciello +--- + drivers/cpufreq/amd-pstate.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1880,12 +1880,6 @@ static int __init amd_pstate_init(void) + return -ENODEV; + } + +- ret = amd_pstate_register_driver(cppc_state); +- if (ret) { +- pr_err("failed to register with return %d\n", ret); +- return ret; +- } +- + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +@@ -1896,6 +1890,12 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + ++ ret = amd_pstate_register_driver(cppc_state); ++ if (ret) { ++ pr_err("failed to register with return %d\n", ret); ++ return ret; ++ } ++ + if (amd_pstate_prefcore) { + ret = amd_detect_prefcore(&amd_pstate_prefcore); + if (ret) diff --git a/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch b/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch new file mode 100644 index 0000000..8c7f40f --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch @@ -0,0 +1,321 @@ +From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:37 -0700 +Subject: perf: Generic hotplug support for a PMU with a scope + +The perf subsystem assumes that the counters of a PMU are per-CPU. So +the user space tool reads a counter from each CPU in the system wide +mode. However, many PMUs don't have a per-CPU counter. The counter is +effective for a scope, e.g., a die or a socket. To address this, a +cpumask is exposed by the kernel driver to restrict to one CPU to stand +for a specific scope. In case the given CPU is removed, +the hotplug support has to be implemented for each such driver. + +The codes to support the cpumask and hotplug are very similar. +- Expose a cpumask into sysfs +- Pickup another CPU in the same scope if the given CPU is removed. +- Invoke the perf_pmu_migrate_context() to migrate to a new CPU. +- In event init, always set the CPU in the cpumask to event->cpu + +Similar duplicated codes are implemented for each such PMU driver. It +would be good to introduce a generic infrastructure to avoid such +duplication. + +5 popular scopes are implemented here, core, die, cluster, pkg, and +the system-wide. The scope can be set when a PMU is registered. If so, a +"cpumask" is automatically exposed for the PMU. + +The "cpumask" is from the perf_online__mask, which is to track +the active CPU for each scope. They are set when the first CPU of the +scope is online via the generic perf hotplug support. When a +corresponding CPU is removed, the perf_online__mask is updated +accordingly and the PMU will be moved to a new CPU from the same scope +if possible. 
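+As a minimal, hypothetical usage sketch (the pmu name below is made up; the
+scope constant and the cpumask behaviour come from the hunks that follow):
+
+  static struct pmu example_pmu = {
+          /* event_init/add/start/stop/read callbacks as before */
+          .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+          .scope        = PERF_PMU_SCOPE_PKG,
+  };
+
+With a scope set, the core exposes the "cpumask" attribute for the PMU and, on
+hotplug, migrates its context to another online CPU of the same package via
+perf_pmu_migrate_context(), so the driver no longer needs its own bookkeeping.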
+ +Signed-off-by: Kan Liang +--- + include/linux/perf_event.h | 18 ++++ + kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 180 insertions(+), 2 deletions(-) + +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -292,6 +292,19 @@ struct perf_event_pmu_context; + #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 + #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 + ++/** ++ * pmu::scope ++ */ ++enum perf_pmu_scope { ++ PERF_PMU_SCOPE_NONE = 0, ++ PERF_PMU_SCOPE_CORE, ++ PERF_PMU_SCOPE_DIE, ++ PERF_PMU_SCOPE_CLUSTER, ++ PERF_PMU_SCOPE_PKG, ++ PERF_PMU_SCOPE_SYS_WIDE, ++ PERF_PMU_MAX_SCOPE, ++}; ++ + struct perf_output_handle; + + #define PMU_NULL_DEV ((void *)(~0UL)) +@@ -315,6 +328,11 @@ struct pmu { + */ + int capabilities; + ++ /* ++ * PMU scope ++ */ ++ unsigned int scope; ++ + int __percpu *pmu_disable_count; + struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -411,6 +411,11 @@ static LIST_HEAD(pmus); + static DEFINE_MUTEX(pmus_lock); + static struct srcu_struct pmus_srcu; + static cpumask_var_t perf_online_mask; ++static cpumask_var_t perf_online_core_mask; ++static cpumask_var_t perf_online_die_mask; ++static cpumask_var_t perf_online_cluster_mask; ++static cpumask_var_t perf_online_pkg_mask; ++static cpumask_var_t perf_online_sys_mask; + static struct kmem_cache *perf_event_cache; + + /* +@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct + } + static DEVICE_ATTR_RW(perf_event_mux_interval_ms); + ++static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) ++{ ++ switch (scope) { ++ case PERF_PMU_SCOPE_CORE: ++ return topology_sibling_cpumask(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_die_cpumask(cpu); ++ case PERF_PMU_SCOPE_CLUSTER: ++ return topology_cluster_cpumask(cpu); ++ case PERF_PMU_SCOPE_PKG: ++ return topology_core_cpumask(cpu); ++ case PERF_PMU_SCOPE_SYS_WIDE: ++ return cpu_online_mask; ++ } ++ ++ return NULL; ++} ++ ++static inline struct cpumask *perf_scope_cpumask(unsigned int scope) ++{ ++ switch (scope) { ++ case PERF_PMU_SCOPE_CORE: ++ return perf_online_core_mask; ++ case PERF_PMU_SCOPE_DIE: ++ return perf_online_die_mask; ++ case PERF_PMU_SCOPE_CLUSTER: ++ return perf_online_cluster_mask; ++ case PERF_PMU_SCOPE_PKG: ++ return perf_online_pkg_mask; ++ case PERF_PMU_SCOPE_SYS_WIDE: ++ return perf_online_sys_mask; ++ } ++ ++ return NULL; ++} ++ ++static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct pmu *pmu = dev_get_drvdata(dev); ++ struct cpumask *mask = perf_scope_cpumask(pmu->scope); ++ ++ if (mask) ++ return cpumap_print_to_pagebuf(true, buf, mask); ++ return 0; ++} ++ ++static DEVICE_ATTR_RO(cpumask); ++ + static struct attribute *pmu_dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, ++ &dev_attr_cpumask.attr, + NULL, + }; + +@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct + if (n == 2 && !pmu->nr_addr_filters) + return 0; + ++ /* cpumask */ ++ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) ++ return 0; ++ + return a->mode; + } + +@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, c + goto free_pdc; + } + ++ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { ++ ret = -EINVAL; ++ goto free_pdc; ++ } ++ + pmu->name = name; + + if (type >= 0) +@@ 
-11750,6 +11814,22 @@ static int perf_try_init_event(struct pm + event_has_any_exclude_flag(event)) + ret = -EINVAL; + ++ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); ++ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); ++ int cpu; ++ ++ if (pmu_cpumask && cpumask) { ++ cpu = cpumask_any_and(pmu_cpumask, cpumask); ++ if (cpu >= nr_cpu_ids) ++ ret = -ENODEV; ++ else ++ event->cpu = cpu; ++ } else { ++ ret = -ENODEV; ++ } ++ } ++ + if (ret && event->destroy) + event->destroy(event); + } +@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_c + int cpu; + + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); ++ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); ++ + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); +@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(vo + raw_spin_unlock(&ctx->lock); + } + ++static void perf_event_clear_cpumask(unsigned int cpu) ++{ ++ int target[PERF_PMU_MAX_SCOPE]; ++ unsigned int scope; ++ struct pmu *pmu; ++ ++ cpumask_clear_cpu(cpu, perf_online_mask); ++ ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); ++ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); ++ ++ target[scope] = -1; ++ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) ++ continue; ++ ++ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) ++ continue; ++ target[scope] = cpumask_any_but(cpumask, cpu); ++ if (target[scope] < nr_cpu_ids) ++ cpumask_set_cpu(target[scope], pmu_cpumask); ++ } ++ ++ /* migrate */ ++ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ++ if (pmu->scope == PERF_PMU_SCOPE_NONE || ++ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) ++ continue; ++ ++ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) ++ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); ++ } ++} ++ + static void perf_event_exit_cpu_context(int cpu) + { + struct perf_cpu_context *cpuctx; +@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context( + + // XXX simplify cpuctx->online + mutex_lock(&pmus_lock); ++ /* ++ * Clear the cpumasks, and migrate to other CPUs if possible. ++ * Must be invoked before the __perf_event_exit_context. ++ */ ++ perf_event_clear_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + +@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context( + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); +- cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); + } + #else +@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context( + + #endif + ++static void perf_event_setup_cpumask(unsigned int cpu) ++{ ++ struct cpumask *pmu_cpumask; ++ unsigned int scope; ++ ++ cpumask_set_cpu(cpu, perf_online_mask); ++ ++ /* ++ * Early boot stage, the cpumask hasn't been set yet. ++ * The perf_online__masks includes the first CPU of each domain. ++ * Always uncondifionally set the boot CPU for the perf_online__masks. 
++ */ ++ if (!topology_sibling_cpumask(cpu)) { ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ pmu_cpumask = perf_scope_cpumask(scope); ++ if (WARN_ON_ONCE(!pmu_cpumask)) ++ continue; ++ cpumask_set_cpu(cpu, pmu_cpumask); ++ } ++ return; ++ } ++ ++ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); ++ ++ pmu_cpumask = perf_scope_cpumask(scope); ++ ++ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) ++ continue; ++ ++ if (!cpumask_empty(cpumask) && ++ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) ++ cpumask_set_cpu(cpu, pmu_cpumask); ++ } ++} ++ + int perf_event_init_cpu(unsigned int cpu) + { + struct perf_cpu_context *cpuctx; +@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); +- cpumask_set_cpu(cpu, perf_online_mask); ++ perf_event_setup_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + diff --git a/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch b/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch new file mode 100644 index 0000000..3240af8 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch @@ -0,0 +1,71 @@ +From 8c7eb17e722a6a45c4436e5debb9336089b21d9b Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:38 -0700 +Subject: perf: Add PERF_EV_CAP_READ_SCOPE + +Usually, an event can be read from any CPU of the scope. It doesn't need +to be read from the advertised CPU. + +Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with +scope can be read from any active CPU in the scope. + +Signed-off-by: Kan Liang +--- + include/linux/perf_event.h | 3 +++ + kernel/events/core.c | 14 +++++++++++--- + 2 files changed, 14 insertions(+), 3 deletions(-) + +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -633,10 +633,13 @@ typedef void (*perf_overflow_handler_t)( + * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and + * cannot be a group leader. If an event with this flag is detached from the + * group it is scheduled out and moved into an unrecoverable ERROR state. ++ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the ++ * PMU scope where it is active. 
+ */ + #define PERF_EV_CAP_SOFTWARE BIT(0) + #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) + #define PERF_EV_CAP_SIBLING BIT(2) ++#define PERF_EV_CAP_READ_SCOPE BIT(3) + + #define SWEVENT_HLIST_BITS 8 + #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -4477,16 +4477,24 @@ struct perf_read_data { + int ret; + }; + ++static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); ++ + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) + { ++ int local_cpu = smp_processor_id(); + u16 local_pkg, event_pkg; + + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + +- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { +- int local_cpu = smp_processor_id(); ++ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { ++ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); ++ ++ if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) ++ return local_cpu; ++ } + ++ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + +@@ -11824,7 +11832,7 @@ static int perf_try_init_event(struct pm + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else +- event->cpu = cpu; ++ event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } diff --git a/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch b/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch new file mode 100644 index 0000000..299cac0 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch @@ -0,0 +1,286 @@ +From 09c1529eb102b486220c35546f2663ca858a2943 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:39 -0700 +Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug + +There are three cstate PMUs with different scopes, core, die and module. +The scopes are supported by the generic perf_event subsystem now. + +Set the scope for each PMU and remove all the cpumask and hotplug codes. 
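+In short (illustrative, condensed from the hunks below), the three PMUs end up
+declaring:
+
+  cstate_core_pmu.scope   = PERF_PMU_SCOPE_CORE;
+  cstate_pkg_pmu.scope    = PERF_PMU_SCOPE_PKG;     /* switched to _DIE on multi-die packages */
+  cstate_module_pmu.scope = PERF_PMU_SCOPE_CLUSTER;
+
+and the per-PMU cpumask files plus the CPUHP_AP_PERF_X86_CSTATE_* hotplug
+states become unnecessary.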
+ +Signed-off-by: Kan Liang +--- + arch/x86/events/intel/cstate.c | 142 ++------------------------------- + include/linux/cpuhotplug.h | 2 - + 2 files changed, 5 insertions(+), 139 deletions(-) + +--- a/arch/x86/events/intel/cstate.c ++++ b/arch/x86/events/intel/cstate.c +@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(st + static struct device_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +-static ssize_t cstate_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, +- char *buf); +- + /* Model -> events mapping */ + struct cstate_model { + unsigned long core_events; +@@ -206,22 +202,9 @@ static struct attribute_group cstate_for + .attrs = cstate_format_attrs, + }; + +-static cpumask_t cstate_core_cpu_mask; +-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); +- +-static struct attribute *cstate_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group cpumask_attr_group = { +- .attrs = cstate_cpumask_attrs, +-}; +- + static const struct attribute_group *cstate_attr_groups[] = { + &cstate_events_attr_group, + &cstate_format_attr_group, +- &cpumask_attr_group, + NULL, + }; + +@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, + }; + +-static cpumask_t cstate_pkg_cpu_mask; +- + /* cstate_module PMU */ + static struct pmu cstate_module_pmu; + static bool has_cstate_module; +@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = { + [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr }, + }; + +-static cpumask_t cstate_module_cpu_mask; +- +-static ssize_t cstate_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, +- char *buf) +-{ +- struct pmu *pmu = dev_get_drvdata(dev); +- +- if (pmu == &cstate_core_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); +- else if (pmu == &cstate_pkg_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); +- else if (pmu == &cstate_module_pmu) +- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask); +- else +- return 0; +-} +- + static int cstate_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config; +- int cpu; + + if (event->attr.type != event->pmu->type) + return -ENOENT; +@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct + if (!(core_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_core_cpu_mask, +- topology_sibling_cpumask(event->cpu)); + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); + if (!(pkg_msr_mask & (1 << cfg))) + return -EINVAL; +- +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + event->hw.event_base = pkg_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_pkg_cpu_mask, +- topology_die_cpumask(event->cpu)); + } else if (event->pmu == &cstate_module_pmu) { + if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX) + return -EINVAL; +@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct + if (!(module_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = module_msr[cfg].msr; +- cpu = cpumask_any_and(&cstate_module_cpu_mask, +- topology_cluster_cpumask(event->cpu)); + } else { + return -ENOENT; + } + +- if (cpu >= nr_cpu_ids) +- return -ENODEV; +- +- event->cpu = cpu; + 
event->hw.config = cfg; + event->hw.idx = -1; + return 0; +@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct p + return 0; + } + +-/* +- * Check if exiting cpu is the designated reader. If so migrate the +- * events when there is a valid target available +- */ +-static int cstate_cpu_exit(unsigned int cpu) +-{ +- unsigned int target; +- +- if (has_cstate_core && +- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { +- +- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_core_cpu_mask); +- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); +- } +- } +- +- if (has_cstate_pkg && +- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { +- +- target = cpumask_any_but(topology_die_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_pkg_cpu_mask); +- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); +- } +- } +- +- if (has_cstate_module && +- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) { +- +- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu); +- /* Migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &cstate_module_cpu_mask); +- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target); +- } +- } +- return 0; +-} +- +-static int cstate_cpu_init(unsigned int cpu) +-{ +- unsigned int target; +- +- /* +- * If this is the first online thread of that core, set it in +- * the core cpu mask as the designated reader. +- */ +- target = cpumask_any_and(&cstate_core_cpu_mask, +- topology_sibling_cpumask(cpu)); +- +- if (has_cstate_core && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_core_cpu_mask); +- +- /* +- * If this is the first online thread of that package, set it +- * in the package cpu mask as the designated reader. +- */ +- target = cpumask_any_and(&cstate_pkg_cpu_mask, +- topology_die_cpumask(cpu)); +- if (has_cstate_pkg && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); +- +- /* +- * If this is the first online thread of that cluster, set it +- * in the cluster cpu mask as the designated reader. 
+- */ +- target = cpumask_any_and(&cstate_module_cpu_mask, +- topology_cluster_cpumask(cpu)); +- if (has_cstate_module && target >= nr_cpu_ids) +- cpumask_set_cpu(cpu, &cstate_module_cpu_mask); +- +- return 0; +-} +- + static const struct attribute_group *core_attr_update[] = { + &group_cstate_core_c1, + &group_cstate_core_c3, +@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_CORE, + .module = THIS_MODULE, + }; + +@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_PKG, + .module = THIS_MODULE, + }; + +@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = { + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, ++ .scope = PERF_PMU_SCOPE_CLUSTER, + .module = THIS_MODULE, + }; + +@@ -810,9 +684,6 @@ static int __init cstate_probe(const str + + static inline void cstate_cleanup(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); +- + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); + +@@ -827,11 +698,6 @@ static int __init cstate_init(void) + { + int err; + +- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, +- "perf/x86/cstate:starting", cstate_cpu_init, NULL); +- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, +- "perf/x86/cstate:online", NULL, cstate_cpu_exit); +- + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (err) { +@@ -844,6 +710,8 @@ static int __init cstate_init(void) + + if (has_cstate_pkg) { + if (topology_max_dies_per_package() > 1) { ++ /* CLX-AP is multi-die and the cstate is die-scope */ ++ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE; + err = perf_pmu_register(&cstate_pkg_pmu, + "cstate_die", -1); + } else { +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -152,7 +152,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, + CPUHP_AP_PERF_X86_STARTING, + CPUHP_AP_PERF_X86_AMD_IBS_STARTING, +- CPUHP_AP_PERF_X86_CSTATE_STARTING, + CPUHP_AP_PERF_XTENSA_STARTING, + CPUHP_AP_ARM_VFP_STARTING, + CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, +@@ -209,7 +208,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, + CPUHP_AP_PERF_X86_RAPL_ONLINE, +- CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, diff --git a/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch b/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch new file mode 100644 index 0000000..5ae825e --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch @@ -0,0 +1,188 @@ +From f91da33af8295b4b3d73a2083225f69e1d5ff301 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:40 -0700 +Subject: iommu/vt-d: Clean up cpumask and hotplug for perfmon + +The iommu PMU is system-wide scope, which is supported by the generic +perf_event subsystem now. + +Set the scope for the iommu PMU and remove all the cpumask and hotplug +codes. 
+ +Reviewed-by: Lu Baolu +Signed-off-by: Kan Liang +Cc: David Woodhouse +Cc: Joerg Roedel +Cc: Will Deacon +Cc: iommu@lists.linux.dev +--- + drivers/iommu/intel/iommu.h | 2 - + drivers/iommu/intel/perfmon.c | 111 +--------------------------------- + 2 files changed, 2 insertions(+), 111 deletions(-) + +--- a/drivers/iommu/intel/iommu.h ++++ b/drivers/iommu/intel/iommu.h +@@ -687,8 +687,6 @@ struct iommu_pmu { + DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); + struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; + unsigned char irq_name[16]; +- struct hlist_node cpuhp_node; +- int cpu; + }; + + #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) +--- a/drivers/iommu/intel/perfmon.c ++++ b/drivers/iommu/intel/perfmon.c +@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_ + .attrs = attrs_empty, + }; + +-static cpumask_t iommu_pmu_cpu_mask; +- +-static ssize_t +-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); +-} +-static DEVICE_ATTR_RO(cpumask); +- +-static struct attribute *iommu_pmu_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL +-}; +- +-static struct attribute_group iommu_pmu_cpumask_attr_group = { +- .attrs = iommu_pmu_cpumask_attrs, +-}; +- + static const struct attribute_group *iommu_pmu_attr_groups[] = { + &iommu_pmu_format_attr_group, + &iommu_pmu_events_attr_group, +- &iommu_pmu_cpumask_attr_group, + NULL + }; + +@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct i + iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; + iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; + iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; + iommu_pmu->pmu.module = THIS_MODULE; + + return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); +@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(st + iommu->perf_irq = 0; + } + +-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +-{ +- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); +- +- if (cpumask_empty(&iommu_pmu_cpu_mask)) +- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); +- +- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) +- iommu_pmu->cpu = cpu; +- +- return 0; +-} +- +-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) +-{ +- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); +- int target = cpumask_first(&iommu_pmu_cpu_mask); +- +- /* +- * The iommu_pmu_cpu_mask has been updated when offline the CPU +- * for the first iommu_pmu. Migrate the other iommu_pmu to the +- * new target. 
+- */ +- if (target < nr_cpu_ids && target != iommu_pmu->cpu) { +- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); +- iommu_pmu->cpu = target; +- return 0; +- } +- +- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) +- return 0; +- +- target = cpumask_any_but(cpu_online_mask, cpu); +- +- if (target < nr_cpu_ids) +- cpumask_set_cpu(target, &iommu_pmu_cpu_mask); +- else +- return 0; +- +- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); +- iommu_pmu->cpu = target; +- +- return 0; +-} +- +-static int nr_iommu_pmu; +-static enum cpuhp_state iommu_cpuhp_slot; +- +-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) +-{ +- int ret; +- +- if (!nr_iommu_pmu) { +- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, +- "driver/iommu/intel/perfmon:online", +- iommu_pmu_cpu_online, +- iommu_pmu_cpu_offline); +- if (ret < 0) +- return ret; +- iommu_cpuhp_slot = ret; +- } +- +- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); +- if (ret) { +- if (!nr_iommu_pmu) +- cpuhp_remove_multi_state(iommu_cpuhp_slot); +- return ret; +- } +- nr_iommu_pmu++; +- +- return 0; +-} +- +-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) +-{ +- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); +- +- if (--nr_iommu_pmu) +- return; +- +- cpuhp_remove_multi_state(iommu_cpuhp_slot); +-} +- + void iommu_pmu_register(struct intel_iommu *iommu) + { + struct iommu_pmu *iommu_pmu = iommu->pmu; +@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iom + if (__iommu_pmu_register(iommu)) + goto err; + +- if (iommu_pmu_cpuhp_setup(iommu_pmu)) +- goto unregister; +- + /* Set interrupt for overflow */ + if (iommu_pmu_set_interrupt(iommu)) +- goto cpuhp_free; ++ goto unregister; + + return; + +-cpuhp_free: +- iommu_pmu_cpuhp_free(iommu_pmu); + unregister: + perf_pmu_unregister(&iommu_pmu->pmu); + err: +@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_i + return; + + iommu_pmu_unset_interrupt(iommu); +- iommu_pmu_cpuhp_free(iommu_pmu); + perf_pmu_unregister(&iommu_pmu->pmu); + } diff --git a/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch b/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch new file mode 100644 index 0000000..df672f0 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch @@ -0,0 +1,238 @@ +From 76278bd3946d618ead2d9cc22612a75a4ab99ace Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:41 -0700 +Subject: dmaengine: idxd: Clean up cpumask and hotplug for perfmon + +The idxd PMU is system-wide scope, which is supported by the generic +perf_event subsystem now. + +Set the scope for the idxd PMU and remove all the cpumask and hotplug +codes. 
+ +Signed-off-by: Kan Liang +Cc: Fenghua Yu +Cc: Dave Jiang +Cc: Vinod Koul +Cc: dmaengine@vger.kernel.org +Reviewed-by: Dave Jiang +Reviewed-by: Fenghua Yu +--- + drivers/dma/idxd/idxd.h | 7 --- + drivers/dma/idxd/init.c | 3 -- + drivers/dma/idxd/perfmon.c | 98 +------------------------------------- + 3 files changed, 1 insertion(+), 107 deletions(-) + +--- a/drivers/dma/idxd/idxd.h ++++ b/drivers/dma/idxd/idxd.h +@@ -124,7 +124,6 @@ struct idxd_pmu { + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; +- int cpu; + + int n_counters; + int counter_width; +@@ -135,8 +134,6 @@ struct idxd_pmu { + + unsigned long supported_filters; + int n_filters; +- +- struct hlist_node cpuhp_node; + }; + + #define IDXD_MAX_PRIORITY 0xf +@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct + int perfmon_pmu_init(struct idxd_device *idxd); + void perfmon_pmu_remove(struct idxd_device *idxd); + void perfmon_counter_overflow(struct idxd_device *idxd); +-void perfmon_init(void); +-void perfmon_exit(void); + #else + static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } + static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} + static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +-static inline void perfmon_init(void) {} +-static inline void perfmon_exit(void) {} + #endif + + /* debugfs */ +--- a/drivers/dma/idxd/init.c ++++ b/drivers/dma/idxd/init.c +@@ -878,8 +878,6 @@ static int __init idxd_init_module(void) + else + support_enqcmd = true; + +- perfmon_init(); +- + err = idxd_driver_register(&idxd_drv); + if (err < 0) + goto err_idxd_driver_register; +@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void + idxd_driver_unregister(&idxd_drv); + pci_unregister_driver(&idxd_pci_driver); + idxd_cdev_remove(); +- perfmon_exit(); + idxd_remove_debugfs(); + } + module_exit(idxd_exit_module); +--- a/drivers/dma/idxd/perfmon.c ++++ b/drivers/dma/idxd/perfmon.c +@@ -6,29 +6,6 @@ + #include "idxd.h" + #include "perfmon.h" + +-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, +- char *buf); +- +-static cpumask_t perfmon_dsa_cpu_mask; +-static bool cpuhp_set_up; +-static enum cpuhp_state cpuhp_slot; +- +-/* +- * perf userspace reads this attribute to determine which cpus to open +- * counters on. It's connected to perfmon_dsa_cpu_mask, which is +- * maintained by the cpu hotplug handlers. +- */ +-static DEVICE_ATTR_RO(cpumask); +- +-static struct attribute *perfmon_cpumask_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group cpumask_attr_group = { +- .attrs = perfmon_cpumask_attrs, +-}; +- + /* + * These attributes specify the bits in the config word that the perf + * syscall uses to pass the event ids and categories to perfmon. 
+@@ -67,16 +44,9 @@ static struct attribute_group perfmon_fo + + static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, +- &cpumask_attr_group, + NULL, + }; + +-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, +- char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +-} +- + static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) + { + return &idxd_pmu->pmu == event->pmu; +@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); +- event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) +@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pm + idxd_pmu->pmu.stop = perfmon_pmu_event_stop; + idxd_pmu->pmu.read = perfmon_pmu_event_update; + idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; + idxd_pmu->pmu.module = THIS_MODULE; + } + +@@ -496,47 +466,11 @@ void perfmon_pmu_remove(struct idxd_devi + if (!idxd->idxd_pmu) + return; + +- cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node); + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + kfree(idxd->idxd_pmu); + idxd->idxd_pmu = NULL; + } + +-static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) +-{ +- struct idxd_pmu *idxd_pmu; +- +- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); +- +- /* select the first online CPU as the designated reader */ +- if (cpumask_empty(&perfmon_dsa_cpu_mask)) { +- cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); +- idxd_pmu->cpu = cpu; +- } +- +- return 0; +-} +- +-static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) +-{ +- struct idxd_pmu *idxd_pmu; +- unsigned int target; +- +- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); +- +- if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) +- return 0; +- +- target = cpumask_any_but(cpu_online_mask, cpu); +- /* migrate events if there is a valid target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); +- perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); +- } +- +- return 0; +-} +- + int perfmon_pmu_init(struct idxd_device *idxd) + { + union idxd_perfcap perfcap; +@@ -544,12 +478,6 @@ int perfmon_pmu_init(struct idxd_device + int rc = -ENODEV; + + /* +- * perfmon module initialization failed, nothing to do +- */ +- if (!cpuhp_set_up) +- return -ENODEV; +- +- /* + * If perfmon_offset or num_counters is 0, it means perfmon is + * not supported on this hardware. 
+ */ +@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device + if (rc) + goto free; + +- rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); +- if (rc) { +- perf_pmu_unregister(&idxd->idxd_pmu->pmu); +- goto free; +- } + out: + return rc; + free: +@@ -637,22 +560,3 @@ free: + + goto out; + } +- +-void __init perfmon_init(void) +-{ +- int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, +- "driver/dma/idxd/perf:online", +- perf_event_cpu_online, +- perf_event_cpu_offline); +- if (WARN_ON(rc < 0)) +- return; +- +- cpuhp_slot = rc; +- cpuhp_set_up = true; +-} +- +-void __exit perfmon_exit(void) +-{ +- if (cpuhp_set_up) +- cpuhp_remove_multi_state(cpuhp_slot); +-} diff --git a/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch b/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch new file mode 100644 index 0000000..c78ed41 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch @@ -0,0 +1,84 @@ +From fb0a3b5932882f02ed42fcaa6db73aba3eafd6d7 Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:42 -0700 +Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug + +The rapl pmu just needs to be allocated once. It doesn't matter to be +allocated at each CPU hotplug, or the global init_rapl_pmus(). + +Move the pmu allocation to the init_rapl_pmus(). So the generic hotplug +supports can be applied. + +Signed-off-by: Kan Liang +Cc: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 44 +++++++++++++++++++++++++++++------------- + 1 file changed, 31 insertions(+), 13 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -568,19 +568,8 @@ static int rapl_cpu_online(unsigned int + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + +- if (!pmu) { +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) +- return -ENOMEM; +- +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); +- +- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; +- } ++ if (!pmu) ++ return -ENOMEM; + + /* + * Check if there is an online cpu in the package which collects rapl +@@ -673,6 +662,32 @@ static const struct attribute_group *rap + NULL, + }; + ++static void __init init_rapl_pmu(void) ++{ ++ struct rapl_pmu *pmu; ++ int cpu; ++ ++ cpus_read_lock(); ++ ++ for_each_cpu(cpu, cpu_online_mask) { ++ pmu = cpu_to_rapl_pmu(cpu); ++ if (pmu) ++ continue; ++ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); ++ if (!pmu) ++ continue; ++ raw_spin_lock_init(&pmu->lock); ++ INIT_LIST_HEAD(&pmu->active_list); ++ pmu->pmu = &rapl_pmus->pmu; ++ pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(pmu); ++ ++ rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; ++ } ++ ++ cpus_read_unlock(); ++} ++ + static int __init init_rapl_pmus(void) + { + int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); +@@ -693,6 +708,9 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; ++ ++ init_rapl_pmu(); ++ + return 0; + } + diff --git a/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch 
b/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch new file mode 100644 index 0000000..565c457 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch @@ -0,0 +1,179 @@ +From 7b4f6ba1b1dc5f3120652bcb5921a697d5167bff Mon Sep 17 00:00:00 2001 +From: Kan Liang +Date: Fri, 2 Aug 2024 08:16:43 -0700 +Subject: perf/x86/rapl: Clean up cpumask and hotplug + +The rapl pmu is die scope, which is supported by the generic perf_event +subsystem now. + +Set the scope for the rapl PMU and remove all the cpumask and hotplug +codes. + +Signed-off-by: Kan Liang +Cc: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 80 +------------------------------------- + include/linux/cpuhotplug.h | 1 - + 2 files changed, 2 insertions(+), 79 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -135,7 +135,6 @@ struct rapl_model { + /* 1/2^hw_unit Joule */ + static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; + static struct rapl_pmus *rapl_pmus; +-static cpumask_t rapl_cpu_mask; + static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; +@@ -340,8 +339,6 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + return -EINVAL; + +@@ -360,7 +357,6 @@ static int rapl_pmu_event_init(struct pe + pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; +- event->cpu = pmu->cpu; + event->pmu_private = pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; +@@ -374,23 +370,6 @@ static void rapl_pmu_event_read(struct p + rapl_event_update(event); + } + +-static ssize_t rapl_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +-} +- +-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); +- +-static struct attribute *rapl_pmu_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group rapl_pmu_attr_group = { +- .attrs = rapl_pmu_attrs, +-}; +- + RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); + RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +@@ -438,7 +417,6 @@ static struct attribute_group rapl_pmu_f + }; + + static const struct attribute_group *rapl_attr_groups[] = { +- &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +@@ -541,49 +519,6 @@ static struct perf_msr amd_rapl_msrs[] = + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_cpu_offline(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- /* Check if exiting cpu is used for collecting rapl events */ +- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) +- return 0; +- +- pmu->cpu = -1; +- /* Find a new cpu to collect rapl events */ +- target = cpumask_any_but(topology_die_cpumask(cpu), cpu); +- +- /* Migrate rapl events to the new target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &rapl_cpu_mask); +- pmu->cpu = target; +- perf_pmu_migrate_context(pmu->pmu, cpu, target); +- } +- return 0; +-} +- +-static int rapl_cpu_online(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- if (!pmu) +- return -ENOMEM; +- +- /* +- * Check if there is an online cpu in the package 
which collects rapl +- * events already. +- */ +- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); +- if (target < nr_cpu_ids) +- return 0; +- +- cpumask_set_cpu(cpu, &rapl_cpu_mask); +- pmu->cpu = cpu; +- return 0; +-} +- + static int rapl_check_hw_unit(struct rapl_model *rm) + { + u64 msr_rapl_power_unit_bits; +@@ -707,6 +642,7 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; ++ rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + + init_rapl_pmu(); +@@ -857,24 +793,13 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- /* +- * Install callbacks. Core will call them for each online cpu. +- */ +- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, +- "perf/x86/rapl:online", +- rapl_cpu_online, rapl_cpu_offline); +- if (ret) +- goto out; +- + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + if (ret) +- goto out1; ++ goto out; + + rapl_advertise(); + return 0; + +-out1: +- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); + out: + pr_warn("Initialization failed (%d), disabled\n", ret); + cleanup_rapl_pmus(); +@@ -884,7 +809,6 @@ module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); + } +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -207,7 +207,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, +- CPUHP_AP_PERF_X86_RAPL_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, diff --git a/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch b/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch new file mode 100644 index 0000000..e535abd --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch @@ -0,0 +1,101 @@ +From f1525664ff9da3241b3556594dc0b67506ae1ddd Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Tue, 10 Sep 2024 14:25:05 +0530 +Subject: perf/x86/rapl: Fix the energy-pkg event for AMD CPUs + +After commit ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"), +on AMD processors that support extended CPUID leaf 0x80000026, the +topology_die_cpumask() and topology_logical_die_id() macros, no longer +return the package cpumask and package id, instead they return the CCD +(Core Complex Die) mask and id respectively. This leads to the energy-pkg +event scope to be modified to CCD instead of package. + +So, change the PMU scope for AMD and Hygon back to package. + +On a 12 CCD 1 Package AMD Zen4 Genoa machine: + +Before: +$ cat /sys/devices/power/cpumask +0,8,16,24,32,40,48,56,64,72,80,88. + +The expected cpumask here is supposed to be just "0", as it is a package +scope event, only one CPU will be collecting the event for all the CPUs in +the package. 
+ +After: +$ cat /sys/devices/power/cpumask +0 + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 35 ++++++++++++++++++++++++++++++++--- + 1 file changed, 32 insertions(+), 3 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -139,9 +139,32 @@ static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; + ++/* ++ * RAPL Package energy counter scope: ++ * 1. AMD/HYGON platforms have a per-PKG package energy counter ++ * 2. For Intel platforms ++ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope ++ * 2.2. Other Intel platforms are single die systems so the scope can be ++ * considered as either pkg-scope or die-scope, and we are considering ++ * them as die-scope. ++ */ ++#define rapl_pmu_is_pkg_scope() \ ++ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ ++ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) ++ ++/* ++ * Helper function to get the correct topology id according to the ++ * RAPL PMU scope. ++ */ ++static inline unsigned int get_rapl_pmu_idx(int cpu) ++{ ++ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : ++ topology_logical_die_id(cpu); ++} ++ + static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) + { +- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu); ++ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); + + /* + * The unsigned check also catches the '-1' return value for non +@@ -617,7 +640,7 @@ static void __init init_rapl_pmu(void) + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + +- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; ++ rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu; + } + + cpus_read_unlock(); +@@ -626,6 +649,12 @@ static void __init init_rapl_pmu(void) + static int __init init_rapl_pmus(void) + { + int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ ++ if (rapl_pmu_is_pkg_scope()) { ++ nr_rapl_pmu = topology_max_packages(); ++ rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ } + + rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) +@@ -641,8 +670,8 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; ++ rapl_pmus->pmu.scope = rapl_pmu_scope; + rapl_pmus->pmu.module = THIS_MODULE; +- rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + + init_rapl_pmu(); diff --git a/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch b/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch new file mode 100644 index 0000000..44994f6 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch @@ -0,0 +1,77 @@ +From 9439067951f4d857272836b35812af26650d9c16 Mon Sep 17 00:00:00 2001 +From: K Prateek Nayak +Date: Fri, 13 Sep 2024 15:21:41 +0000 +Subject: x86/topology: Introduce topology_logical_core_id() + +On x86, topology_core_id() returns a unique core ID within the PKG +domain. Looking at match_smt() suggests that a core ID just needs to be +unique within a LLC domain. For use cases such as the per-core RAPL PMU, +there exists a need for a unique core ID across the entire system with +multiple PKG domains. Introduce topology_logical_core_id() to derive a +unique core ID across the system. 
+ +Signed-off-by: K Prateek Nayak +Signed-off-by: Dhananjay Ugwekar +Reviewed-by: Zhang Rui +Tested-by: K Prateek Nayak +--- + Documentation/arch/x86/topology.rst | 4 ++++ + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + 5 files changed, 8 insertions(+) + +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in t + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_cluster + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_fil + seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_sca + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ diff --git a/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch b/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch new file mode 100644 index 0000000..75276ef --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch @@ -0,0 +1,87 @@ +From b8e1231d5f78314de8f9066baba7b1fdd5e59218 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:21:42 +0000 +Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function + +Preparation for the addition of per-core RAPL energy counter support for +AMD CPUs. Post which, one cpu might be mapped to more than one rapl_pmu +(package/die one or per-core one), also makes sense to use the +get_rapl_pmu_idx macro which is anyway used to index into the +rapl_pmus->pmus[] array. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 29 +++++++++++++---------------- + 1 file changed, 13 insertions(+), 16 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -162,17 +162,6 @@ static inline unsigned int get_rapl_pmu_ + topology_logical_die_id(cpu); + } + +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +-{ +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- +- /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. +- */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; +-} +- + static inline u64 rapl_read_counter(struct perf_event *event) + { + u64 raw; +@@ -348,7 +337,7 @@ static void rapl_pmu_event_del(struct pe + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; ++ int bit, rapl_pmu_idx, ret = 0; + struct rapl_pmu *pmu; + + /* only look at RAPL events */ +@@ -376,8 +365,12 @@ static int rapl_pmu_event_init(struct pe + if (event->attr.sample_period) /* no sampling */ + return -EINVAL; + ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ return -EINVAL; ++ + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); ++ pmu = rapl_pmus->pmus[rapl_pmu_idx]; + if (!pmu) + return -EINVAL; + event->pmu_private = pmu; +@@ -623,12 +616,16 @@ static const struct attribute_group *rap + static void __init init_rapl_pmu(void) + { + struct rapl_pmu *pmu; +- int cpu; ++ int cpu, rapl_pmu_idx; + + cpus_read_lock(); + + for_each_cpu(cpu, cpu_online_mask) { +- pmu = cpu_to_rapl_pmu(cpu); ++ rapl_pmu_idx = get_rapl_pmu_idx(cpu); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ continue; ++ ++ pmu = rapl_pmus->pmus[rapl_pmu_idx]; + if (pmu) + continue; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +@@ -640,7 +637,7 @@ static void __init init_rapl_pmu(void) + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + +- rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu; ++ rapl_pmus->pmus[rapl_pmu_idx] = pmu; + } + + cpus_read_unlock(); diff --git a/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch b/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch new file mode 100644 index 0000000..0846423 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch @@ -0,0 +1,240 @@ +From 07ec9f38cac6eb6e5b0b062ef99e9458ba567de8 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:21:43 +0000 +Subject: perf/x86/rapl: Rename rapl_pmu variables + +Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to +avoid any confusion between the variables of two different +structs pmu and rapl_pmu. As rapl_pmu also contains a pointer to +struct pmu, which leads to situations in code like pmu->pmu, +which is needlessly confusing. Above scenario is replaced with +much more readable rapl_pmu->pmu with this change. + +Also rename "pmus" member in rapl_pmus struct, for same reason. + +No functional change. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 93 +++++++++++++++++++++--------------------- + 1 file changed, 47 insertions(+), 46 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -116,7 +116,7 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -223,34 +223,34 @@ static void rapl_start_hrtimer(struct ra + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -258,39 +258,39 @@ static void __rapl_pmu_event_start(struc + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -308,23 +308,23 @@ static void rapl_pmu_event_stop(struct p + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -338,7 +338,7 @@ static int rapl_pmu_event_init(struct pe + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, rapl_pmu_idx, ret = 0; +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmus->pmu.type) +@@ -370,10 +370,11 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + /* must be done before validate_group */ +- pmu = rapl_pmus->pmus[rapl_pmu_idx]; +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->pmu_private = pmu; ++ ++ event->pmu_private = rapl_pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; +@@ -600,7 +601,7 @@ static void cleanup_rapl_pmus(void) + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -615,7 +616,7 @@ static const struct attribute_group *rap + + static void __init init_rapl_pmu(void) + { +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + int cpu, rapl_pmu_idx; + + cpus_read_lock(); +@@ -625,19 +626,19 @@ static void __init init_rapl_pmu(void) + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + continue; + +- pmu = rapl_pmus->pmus[rapl_pmu_idx]; +- if (pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (rapl_pmu) + continue; +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) ++ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu)); ++ if (!rapl_pmu) + continue; +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); + +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; ++ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu; + } + + cpus_read_unlock(); +@@ -653,7 +654,7 @@ static int __init init_rapl_pmus(void) + rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + } + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + diff --git a/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch b/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch new file mode 100644 index 0000000..9bc67dc --- /dev/null 
+++ b/debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch @@ -0,0 +1,75 @@ +From 68614752b9fd6b6bae6f9ab7b02fc28350c5a541 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:56 +0000 +Subject: perf/x86/rapl: Make rapl_model struct global + +Preparation for per-core energy counter support addition for AMD CPUs. + +As there will always be just one rapl_model variable on a system, make it +global, to make it easier to access it from any function. + +No functional change. + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -138,6 +138,7 @@ static struct rapl_pmus *rapl_pmus; + static unsigned int rapl_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* + * RAPL Package energy counter scope: +@@ -536,18 +537,18 @@ static struct perf_msr amd_rapl_msrs[] = + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; + int ret; + + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = (struct rapl_model *) id->driver_data; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rm->rapl_msrs; ++ rapl_msrs = rapl_model->rapl_msrs; + + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); ++ false, (void *) &rapl_model->events); + +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + diff --git a/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch b/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch new file mode 100644 index 0000000..9fb5743 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch @@ -0,0 +1,112 @@ +From b10b887510ccb0b6bc7294888982b862703c9c32 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:57 +0000 +Subject: perf/x86/rapl: Add arguments to the cleanup and init functions + +Prep for per-core RAPL PMU addition. + +No functional change. 
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 32 +++++++++++++++++++------------- + 1 file changed, 19 insertions(+), 13 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -597,7 +597,7 @@ static void __init rapl_advertise(void) + } + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + +@@ -615,7 +615,7 @@ static const struct attribute_group *rap + NULL, + }; + +-static void __init init_rapl_pmu(void) ++static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { + struct rapl_pmu *rapl_pmu; + int cpu, rapl_pmu_idx; +@@ -645,20 +645,22 @@ static void __init init_rapl_pmu(void) + cpus_read_unlock(); + } + +-static int __init init_rapl_pmus(void) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) + { +- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); +- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ int nr_rapl_pmu; ++ struct rapl_pmus *rapl_pmus; + +- if (rapl_pmu_is_pkg_scope()) { +- nr_rapl_pmu = topology_max_packages(); +- rapl_pmu_scope = PERF_PMU_SCOPE_PKG; +- } ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG) ++ nr_rapl_pmu = topology_max_packages(); ++ else ++ nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); + + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -673,7 +675,7 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + +- init_rapl_pmu(); ++ init_rapl_pmu(rapl_pmus); + + return 0; + } +@@ -799,8 +801,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; ++ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pmu_is_pkg_scope()) ++ rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; +@@ -816,7 +822,7 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); + if (ret) + return ret; + +@@ -829,7 +835,7 @@ static int __init rapl_pmu_init(void) + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus); + return ret; + } + module_init(rapl_pmu_init); +@@ -837,6 +843,6 @@ module_init(rapl_pmu_init); + static void __exit intel_rapl_exit(void) + { + perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus); + } + module_exit(intel_rapl_exit); diff --git a/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch b/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch new file mode 100644 index 0000000..198598b --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch @@ -0,0 +1,358 @@ +From b5c83c40540298a39f8314034b705f1236b17a9f Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:58 +0000 +Subject: perf/x86/rapl: Modify the generic variable names to *_pkg* + +Prep for addition of power_per_core PMU to handle core scope energy +consumption for AMD CPUs. 
+ +Replace the generic names with *_pkg*, to differentiate between the +scopes of the two different PMUs and their variables. + +No functional change. + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 118 ++++++++++++++++++++--------------------- + 1 file changed, 59 insertions(+), 59 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -70,18 +70,18 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -126,16 +126,16 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ unsigned long pkg_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; + static struct perf_msr *rapl_msrs; + static struct rapl_model *rapl_model; +@@ -149,7 +149,7 @@ static struct rapl_model *rapl_model; + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. + */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -159,7 +159,7 @@ static struct rapl_model *rapl_model; + */ + static inline unsigned int get_rapl_pmu_idx(int cpu) + { +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : ++ return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : + topology_logical_die_id(cpu); + } + +@@ -172,7 +172,7 @@ static inline u64 rapl_read_counter(stru + + static inline u64 rapl_scale(u64 v, int cfg) + { +- if (cfg > NR_RAPL_DOMAINS) { ++ if (cfg > NR_RAPL_PKG_DOMAINS) { + pr_warn("Invalid domain %d, failed to scale data\n", cfg); + return v; + } +@@ -182,7 +182,7 @@ static inline u64 rapl_scale(u64 v, int + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - rapl_pkg_hw_unit[cfg - 1]); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe + struct rapl_pmu *rapl_pmu; + + /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) + return -ENOENT; + + /* check only supported bits are set */ +@@ -352,14 +352,14 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); + bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pkg_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ +@@ -367,11 +367,11 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); +- if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) ++ if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) + return -EINVAL; + + /* must be done before validate_group */ +- rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) + return -EINVAL; + +@@ -525,11 +525,11 @@ static struct perf_msr intel_rapl_spr_ms + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -545,8 +545,8 @@ static int rapl_check_hw_unit(void) + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + switch (rapl_model->unit_quirk) { + /* +@@ -556,11 +556,11 @@ static int rapl_check_hw_unit(void) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. 
*/ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; +@@ -575,9 +575,9 @@ static int rapl_check_hw_unit(void) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -587,12 +587,12 @@ static void __init rapl_advertise(void) + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pkg_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } + } +@@ -681,71 +681,71 @@ static int __init init_rapl_pmus(struct + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, + }; + + static const 
struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -801,11 +801,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + +- if (rapl_pmu_is_pkg_scope()) +- rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; + + id = x86_match_cpu(rapl_model_match); + if (!id) +@@ -813,20 +813,20 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rapl_model->rapl_msrs; ++ rapl_msrs = rapl_model->rapl_pkg_msrs; + +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rapl_model->events); ++ rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, ++ false, (void *) &rapl_model->pkg_events); + + ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); + if (ret) + return ret; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + +@@ -835,14 +835,14 @@ static int __init rapl_pmu_init(void) + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(rapl_pmus); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(rapl_pmus); ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); diff --git a/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch b/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch new file mode 100644 index 0000000..bce1965 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch @@ -0,0 +1,47 @@ +From dbc0343069c8f86fad0d8d9075f70f79114ef10a Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:47:59 +0000 +Subject: perf/x86/rapl: Remove the global variable rapl_msrs + +After making the rapl_model struct global, the rapl_msrs global +variable isn't needed, so remove it. + +Also it will be cleaner when new per-core scope PMU is added. As we will +need to maintain two rapl_msrs array(one for per-core scope and one for +package scope PMU), inside the rapl_model struct. 
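For orientation, a minimal standalone sketch of the data layout this rapl_msrs cleanup converges on. The stub perf_msr and rapl_unit_quirk definitions are simplified stand-ins (not the kernel's real ones), and the per-core fields only exist once the later per-core patch in this set is applied:

    /* Simplified stand-ins for the real definitions in arch/x86/events. */
    struct perf_msr { unsigned int msr; };
    enum rapl_unit_quirk { RAPL_UNIT_QUIRK_NONE };

    /* Shape of rapl_model as the hunks below leave it: the MSR tables live
     * in the model, so no file-scope rapl_msrs pointer is needed. */
    struct rapl_model {
            struct perf_msr *rapl_pkg_msrs;   /* package/die-scope events */
            struct perf_msr *rapl_core_msrs;  /* per-core events, added later in this set */
            unsigned long pkg_events;
            unsigned long core_events;
            unsigned int msr_power_unit;
            enum rapl_unit_quirk unit_quirk;
    };

    static struct rapl_model *rapl_model;

    /* Callers now index the model directly, e.g. for the event base: */
    static unsigned int rapl_event_base(int bit)
    {
            return rapl_model->rapl_pkg_msrs[bit].msr;
    }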
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -137,7 +137,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_ + static struct rapl_pmus *rapl_pmus_pkg; + static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; + static struct rapl_model *rapl_model; + + /* +@@ -376,7 +375,7 @@ static int rapl_pmu_event_init(struct pe + return -EINVAL; + + event->pmu_private = rapl_pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -813,9 +812,7 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rapl_model->rapl_pkg_msrs; +- +- rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, ++ rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, + false, (void *) &rapl_model->pkg_events); + + ret = rapl_check_hw_unit(); diff --git a/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch b/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch new file mode 100644 index 0000000..606ef19 --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch @@ -0,0 +1,79 @@ +From d6a5a28382558b896767a78db795d421015831a7 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:48:00 +0000 +Subject: perf/x86/rapl: Move the cntr_mask to rapl_pmus struct + +Preparation for the addition of per-core RAPL energy counter for AMD +CPUs. + +Moving cntr_mask to rapl_pmus struct instead of adding a new global +cntr_mask for the per-core RAPL energy counter, will ensure that the +"per_core_cntr_mask" is only created if needed (i.e. in case of AMD +CPUs). 
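A compressed view of where the counter mask ends up (stand-in types, field order as in the hunk below); the point is that a mask only exists for a rapl_pmus instance that is actually allocated, so the AMD per-core mask never has to be a global:

    /* Stand-ins so the sketch is self-contained. */
    struct pmu { int type; int scope; };
    struct rapl_pmu;

    struct rapl_pmus {
            struct pmu pmu;
            unsigned int nr_rapl_pmu;
            unsigned int cntr_mask;          /* filled by perf_msr_probe() after allocation */
            struct rapl_pmu *rapl_pmu[];     /* one instance per package/die (or core, later) */
    };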
+ +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -116,6 +116,7 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; ++ unsigned int cntr_mask; + struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + +@@ -135,7 +136,6 @@ struct rapl_model { + /* 1/2^hw_unit Joule */ + static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; + static struct rapl_pmus *rapl_pmus_pkg; +-static unsigned int rapl_pkg_cntr_mask; + static u64 rapl_timer_ms; + static struct rapl_model *rapl_model; + +@@ -358,7 +358,7 @@ static int rapl_pmu_event_init(struct pe + bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_pkg_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ +@@ -586,10 +586,10 @@ static void __init rapl_advertise(void) + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); ++ hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { +- if (rapl_pkg_cntr_mask & (1 << i)) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } +@@ -812,9 +812,6 @@ static int __init rapl_pmu_init(void) + + rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, +- false, (void *) &rapl_model->pkg_events); +- + ret = rapl_check_hw_unit(); + if (ret) + return ret; +@@ -823,6 +820,10 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; diff --git a/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch b/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch new file mode 100644 index 0000000..55f7a3c --- /dev/null +++ b/debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch @@ -0,0 +1,439 @@ +From 3cb480ec2950f4c6351c602552fc4f9a8e524b89 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar +Date: Fri, 13 Sep 2024 15:48:01 +0000 +Subject: perf/x86/rapl: Add per-core energy counter support for AMD CPUs + +Add a new "power_per_core" PMU and "energy-per-core" event for +monitoring energy consumption by each core. The existing energy-cores +event aggregates the energy consumption at the package level. +This new event aligns with the AMD's per_core energy counters. + +Tested the package level and core level PMU counters with workloads +pinned to different CPUs. 
+ +Results with workload pinned to CPU 1 in core 1 on a AMD Zen4 Genoa +machine: + +$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1 + + Performance counter stats for 'system wide': + +S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/ +S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/ +S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/ + +Signed-off-by: Dhananjay Ugwekar +--- + arch/x86/events/rapl.c | 178 +++++++++++++++++++++++++++++++++-------- + 1 file changed, 143 insertions(+), 35 deletions(-) + +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * per_core counter: consumption of a single physical core ++ * event: rapl_energy_per_core (power_per_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -81,6 +85,10 @@ enum perf_rapl_pkg_events { + NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + ++#define PERF_RAPL_PER_CORE 0 /* per-core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ + static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", +@@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "per-core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -128,14 +138,18 @@ enum rapl_unit_quirk { + + struct rapl_model { + struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; + unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ + static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; + static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; + static struct rapl_model *rapl_model; + +@@ -156,10 +170,14 @@ static struct rapl_model *rapl_model; + * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- return rapl_pkg_pmu_is_pkg_scope() ? 
topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); ++ if (scope == PERF_PMU_SCOPE_PKG) ++ return topology_logical_package_id(cpu); ++ else if (scope == PERF_PMU_SCOPE_DIE) ++ return topology_logical_die_id(cpu); ++ else ++ return topology_logical_core_id(cpu); + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -169,19 +187,20 @@ static inline u64 rapl_read_counter(stru + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_PKG_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ +- return v << (32 - rapl_pkg_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -208,7 +227,7 @@ static u64 rapl_event_update(struct perf + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -337,12 +356,13 @@ static void rapl_pmu_event_del(struct pe + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, rapl_pmu_idx, ret = 0; ++ int bit, rapl_pmus_scope, rapl_pmu_idx, ret = 0; + struct rapl_pmu *rapl_pmu; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus_pkg->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -351,31 +371,49 @@ static int rapl_pmu_event_init(struct pe + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) + return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; + +- cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); +- bit = cfg - 1; +- +- /* check event supported */ +- if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL per-core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ /* check event supported */ ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- rapl_pmu_idx = 
get_rapl_pmu_idx(event->cpu); +- if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) + return -EINVAL; + + event->pmu_private = rapl_pmu; +- event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -392,12 +430,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -407,6 +447,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -439,6 +480,12 @@ static const struct attribute_group *rap + NULL, + }; + ++static const struct attribute_group *rapl_per_core_attr_groups[] = { ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ + static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), +@@ -499,6 +546,18 @@ static struct attribute_group rapl_event + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_per_core[] = { ++ EVENT_PTR(rapl_per_core), ++ EVENT_PTR(rapl_per_core_unit), ++ EVENT_PTR(rapl_per_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_per_core_group = { ++ .name = "events", ++ .attrs = rapl_events_per_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -536,6 +595,11 @@ static struct perf_msr amd_rapl_pkg_msrs + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; ++ + static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; +@@ -547,6 +611,8 @@ static int rapl_check_hw_unit(void) + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) + rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ + switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be +@@ -565,7 +631,6 @@ static int rapl_check_hw_unit(void) + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter 
+@@ -584,9 +649,13 @@ static int rapl_check_hw_unit(void) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { + if (rapl_pmus_pkg->cntr_mask & (1 << i)) { +@@ -594,6 +663,10 @@ static void __init rapl_advertise(void) + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_PER_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + + static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) +@@ -614,6 +687,10 @@ static const struct attribute_group *rap + NULL, + }; + ++static const struct attribute_group *rapl_per_core_attr_update[] = { ++ &rapl_events_per_core_group, ++}; ++ + static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { + struct rapl_pmu *rapl_pmu; +@@ -622,10 +699,9 @@ static void __init init_rapl_pmu(struct + cpus_read_lock(); + + for_each_cpu(cpu, cpu_online_mask) { +- rapl_pmu_idx = get_rapl_pmu_idx(cpu); ++ rapl_pmu_idx = get_rapl_pmu_idx(cpu, rapl_pmus->pmu.scope); + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + continue; +- + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (rapl_pmu) + continue; +@@ -644,15 +720,19 @@ static void __init init_rapl_pmu(struct + cpus_read_unlock(); + } + +-static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu; + struct rapl_pmus *rapl_pmus; + + if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG) + nr_rapl_pmu = topology_max_packages(); +- else ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) + nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package(); ++ else ++ nr_rapl_pmu = topology_max_packages() * topology_num_cores_per_package(); + + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) +@@ -743,8 +823,10 @@ static struct rapl_model model_spr = { + + static struct rapl_model model_amd_hygon = { + .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_PER_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -816,7 +898,8 @@ static int __init rapl_pmu_init(void) + if (ret) + return ret; + +- ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +@@ -828,6 +911,27 @@ static int __init rapl_pmu_init(void) + if (ret) + goto out; + ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_per_core_attr_groups, ++ rapl_per_core_attr_update); ++ if (ret) { ++ pr_warn("Per-core PMU initialization failed (%d)\n", ret); ++ goto per_core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = 
perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1); ++ if (ret) { ++ pr_warn("Per-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } ++ ++per_core_init_failed: + rapl_advertise(); + return 0; + +@@ -840,6 +944,10 @@ module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } + perf_pmu_unregister(&rapl_pmus_pkg->pmu); + cleanup_rapl_pmus(rapl_pmus_pkg); + } diff --git a/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch new file mode 100644 index 0000000..01dacc3 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch @@ -0,0 +1,180 @@ +From d31e903a364802c068ff23bdd448cc70eda71a7c Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:38 +0100 +Subject: cpuidle: menu: Remove iowait influence + +Remove CPU iowaiters influence on idle state selection. +Remove the menu notion of performance multiplier which increased with +the number of tasks that went to iowait sleep on this CPU and haven't +woken up yet. + +Relying on iowait for cpuidle is problematic for a few reasons: +1. There is no guarantee that an iowaiting task will wake up on the +same CPU. +2. The task being in iowait says nothing about the idle duration, we +could be selecting shallower states for a long time. +3. The task being in iowait doesn't always imply a performance hit +with increased latency. +4. If there is such a performance hit, the number of iowaiting tasks +doesn't directly correlate. +5. The definition of iowait altogether is vague at best, it is +sprinkled across kernel code. + +Signed-off-by: Christian Loehle +--- + drivers/cpuidle/governors/menu.c | 76 ++++---------------------------- + 1 file changed, 9 insertions(+), 67 deletions(-) + +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -19,7 +19,7 @@ + + #include "gov.h" + +-#define BUCKETS 12 ++#define BUCKETS 6 + #define INTERVAL_SHIFT 3 + #define INTERVALS (1UL << INTERVAL_SHIFT) + #define RESOLUTION 1024 +@@ -29,12 +29,11 @@ + /* + * Concepts and ideas behind the menu governor + * +- * For the menu governor, there are 3 decision factors for picking a C ++ * For the menu governor, there are 2 decision factors for picking a C + * state: + * 1) Energy break even point +- * 2) Performance impact +- * 3) Latency tolerance (from pmqos infrastructure) +- * These three factors are treated independently. ++ * 2) Latency tolerance (from pmqos infrastructure) ++ * These two factors are treated independently. + * + * Energy break even point + * ----------------------- +@@ -75,30 +74,6 @@ + * intervals and if the stand deviation of these 8 intervals is below a + * threshold value, we use the average of these intervals as prediction. + * +- * Limiting Performance Impact +- * --------------------------- +- * C states, especially those with large exit latencies, can have a real +- * noticeable impact on workloads, which is not acceptable for most sysadmins, +- * and in addition, less performance has a power price of its own. 
+- * +- * As a general rule of thumb, menu assumes that the following heuristic +- * holds: +- * The busier the system, the less impact of C states is acceptable +- * +- * This rule-of-thumb is implemented using a performance-multiplier: +- * If the exit latency times the performance multiplier is longer than +- * the predicted duration, the C state is not considered a candidate +- * for selection due to a too high performance impact. So the higher +- * this multiplier is, the longer we need to be idle to pick a deep C +- * state, and thus the less likely a busy CPU will hit such a deep +- * C state. +- * +- * Currently there is only one value determining the factor: +- * 10 points are added for each process that is waiting for IO on this CPU. +- * (This value was experimentally determined.) +- * Utilization is no longer a factor as it was shown that it never contributed +- * significantly to the performance multiplier in the first place. +- * + */ + + struct menu_device { +@@ -112,19 +87,10 @@ struct menu_device { + int interval_ptr; + }; + +-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) ++static inline int which_bucket(u64 duration_ns) + { + int bucket = 0; + +- /* +- * We keep two groups of stats; one with no +- * IO pending, one without. +- * This allows us to calculate +- * E(duration)|iowait +- */ +- if (nr_iowaiters) +- bucket = BUCKETS/2; +- + if (duration_ns < 10ULL * NSEC_PER_USEC) + return bucket; + if (duration_ns < 100ULL * NSEC_PER_USEC) +@@ -138,19 +104,6 @@ static inline int which_bucket(u64 durat + return bucket + 5; + } + +-/* +- * Return a multiplier for the exit latency that is intended +- * to take performance requirements into account. +- * The more performance critical we estimate the system +- * to be, the higher this multiplier, and thus the higher +- * the barrier to go to an expensive C state. +- */ +-static inline int performance_multiplier(unsigned int nr_iowaiters) +-{ +- /* for IO wait tasks (per cpu!) we add 10x each */ +- return 1 + 10 * nr_iowaiters; +-} +- + static DEFINE_PER_CPU(struct menu_device, menu_devices); + + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); +@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_dr + struct menu_device *data = this_cpu_ptr(&menu_devices); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 predicted_ns; +- u64 interactivity_req; +- unsigned int nr_iowaiters; + ktime_t delta, delta_tick; + int i, idx; + +@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_dr + data->needs_update = 0; + } + +- nr_iowaiters = nr_iowait_cpu(dev->cpu); +- + /* Find the shortest expected idle interval. */ + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; + if (predicted_ns > RESIDENCY_THRESHOLD_NS) { +@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_dr + } + + data->next_timer_ns = delta; +- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); ++ data->bucket = which_bucket(data->next_timer_ns); + + /* Round up the result for half microseconds. 
*/ + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + +@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_dr + */ + data->next_timer_ns = KTIME_MAX; + delta_tick = TICK_NSEC / 2; +- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); ++ data->bucket = which_bucket(KTIME_MAX); + } + + if (unlikely(drv->state_count <= 1 || latency_req == 0) || +@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_dr + */ + if (predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; +- } else { +- /* +- * Use the performance multiplier and the user-configurable +- * latency_req to determine the maximum exit latency. +- */ +- interactivity_req = div64_u64(predicted_ns, +- performance_multiplier(nr_iowaiters)); +- if (latency_req > interactivity_req) +- latency_req = interactivity_req; ++ } else if (latency_req > predicted_ns) { ++ latency_req = predicted_ns; + } + + /* diff --git a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch new file mode 100644 index 0000000..d32e22a --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch @@ -0,0 +1,58 @@ +From 3f840a42780323a4437dd1a417488d141c33af15 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:39 +0100 +Subject: cpuidle: Prefer teo over menu governor + +Since menu no longer has the interactivity boost teo works better +overall, so make it the default. + +Signed-off-by: Christian Loehle +--- + drivers/cpuidle/Kconfig | 5 +---- + drivers/cpuidle/governors/menu.c | 2 +- + drivers/cpuidle/governors/teo.c | 2 +- + 3 files changed, 3 insertions(+), 6 deletions(-) + +--- a/drivers/cpuidle/Kconfig ++++ b/drivers/cpuidle/Kconfig +@@ -5,7 +5,7 @@ config CPU_IDLE + bool "CPU idle PM support" + default y if ACPI || PPC_PSERIES + select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) +- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO ++ select CPU_IDLE_GOV_TEO if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_MENU + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform +@@ -30,9 +30,6 @@ config CPU_IDLE_GOV_TEO + This governor implements a simplified idle state selection method + focused on timer events and does not do any interactivity boosting. + +- Some workloads benefit from using it and it generally should be safe +- to use. Say Y here if you are not happy with the alternatives. 
+- + config CPU_IDLE_GOV_HALTPOLL + bool "Haltpoll governor (for virtualized systems)" + depends on KVM_GUEST +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -508,7 +508,7 @@ static int menu_enable_device(struct cpu + + static struct cpuidle_governor menu_governor = { + .name = "menu", +- .rating = 20, ++ .rating = 19, + .enable = menu_enable_device, + .select = menu_select, + .reflect = menu_reflect, +--- a/drivers/cpuidle/governors/teo.c ++++ b/drivers/cpuidle/governors/teo.c +@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui + + static struct cpuidle_governor teo_governor = { + .name = "teo", +- .rating = 19, ++ .rating = 20, + .enable = teo_enable_device, + .select = teo_select, + .reflect = teo_reflect, diff --git a/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch b/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch new file mode 100644 index 0000000..cc7e290 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch @@ -0,0 +1,39 @@ +From ca8c9368b6f28ef625716b03aa930acfb8afe158 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:40 +0100 +Subject: TEST: cpufreq/schedutil: Linear iowait boost step + +In preparation for capping iowait boost make the steps linear as +opposed to doubling. + +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -267,7 +267,8 @@ static void sugov_iowait_boost(struct su + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost = +- min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); ++ min_t(unsigned int, ++ sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE); + return; + } + +@@ -308,11 +309,9 @@ static unsigned long sugov_iowait_apply( + /* + * No boost pending; reduce the boost value. + */ +- sg_cpu->iowait_boost >>= 1; +- if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { +- sg_cpu->iowait_boost = 0; ++ sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN; ++ if (!sg_cpu->iowait_boost) + return 0; +- } + } + + sg_cpu->iowait_boost_pending = false; diff --git a/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch b/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch new file mode 100644 index 0000000..a655dad --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch @@ -0,0 +1,106 @@ +From 33f05bd16a4ac2f6f36c9eb88016e2375dcb597c Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:41 +0100 +Subject: TEST: cpufreq/schedutil: iowait boost cap sysfs + +Add a knob to cap applied iowait_boost per sysfs. +This is to test for potential regressions. 
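To make the knob concrete, here is a small userspace mock of the ramp-and-clamp behaviour these two TEST patches give schedutil. The constants mirror the ones visible in the hunks (SCHED_CAPACITY_SCALE is 1024 in current kernels, IOWAIT_BOOST_MIN an eighth of that); the loop is only an approximation of sugov_iowait_boost()/sugov_iowait_apply() under back-to-back in_iowait wakeups, and the cap value of 256 is an arbitrary example:

    #include <stdio.h>

    /* Constants mirroring the kernel's (SCHED_CAPACITY_SCALE is 1024). */
    #define SCHED_CAPACITY_SCALE 1024u
    #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)

    int main(void)
    {
            unsigned int cap = 256;   /* example value written to iowait_boost_cap */
            unsigned int boost = 0;

            /* Pretend one boosted wakeup arrives per tick, eight ticks in a row. */
            for (int tick = 1; tick <= 8; tick++) {
                    /* linear step from the previous TEST patch */
                    boost = boost ? boost + IOWAIT_BOOST_MIN : IOWAIT_BOOST_MIN;
                    if (boost > SCHED_CAPACITY_SCALE)
                            boost = SCHED_CAPACITY_SCALE;
                    /* clamp added by this patch */
                    if (boost > cap)
                            boost = cap;
                    printf("tick %d: iowait_boost=%u\n", tick, boost);
            }
            return 0;
    }

With the default cap of SCHED_CAPACITY_SCALE the clamp never fires, so behaviour is unchanged until the sysfs attribute is lowered.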
+ +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -11,6 +11,7 @@ + struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; ++ unsigned int iowait_boost_cap; + }; + + struct sugov_policy { +@@ -35,6 +36,8 @@ struct sugov_policy { + + bool limits_changed; + bool need_freq_update; ++ ++ unsigned int iowait_boost_cap; + }; + + struct sugov_cpu { +@@ -316,6 +319,9 @@ static unsigned long sugov_iowait_apply( + + sg_cpu->iowait_boost_pending = false; + ++ if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap) ++ sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap; ++ + /* + * sg_cpu->util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. +@@ -554,6 +560,14 @@ static ssize_t rate_limit_us_show(struct + return sprintf(buf, "%u\n", tunables->rate_limit_us); + } + ++ ++static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf) ++{ ++ struct sugov_tunables *tunables = to_sugov_tunables(attr_set); ++ ++ return sprintf(buf, "%u\n", tunables->iowait_boost_cap); ++} ++ + static ssize_t + rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) + { +@@ -572,10 +586,30 @@ rate_limit_us_store(struct gov_attr_set + return count; + } + ++static ssize_t ++iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count) ++{ ++ struct sugov_tunables *tunables = to_sugov_tunables(attr_set); ++ struct sugov_policy *sg_policy; ++ unsigned int iowait_boost_cap; ++ ++ if (kstrtouint(buf, 10, &iowait_boost_cap)) ++ return -EINVAL; ++ ++ tunables->iowait_boost_cap = iowait_boost_cap; ++ ++ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) ++ sg_policy->iowait_boost_cap = iowait_boost_cap; ++ ++ return count; ++} ++ + static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); ++static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap); + + static struct attribute *sugov_attrs[] = { + &rate_limit_us.attr, ++ &iowait_boost_cap.attr, + NULL + }; + ATTRIBUTE_GROUPS(sugov); +@@ -765,6 +799,8 @@ static int sugov_init(struct cpufreq_pol + + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); + ++ tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE; ++ + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + +@@ -834,6 +870,8 @@ static int sugov_start(struct cpufreq_po + sg_policy->limits_changed = false; + sg_policy->cached_raw_freq = 0; + ++ sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE; ++ + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + + if (policy_is_shared(policy)) diff --git a/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch b/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch new file mode 100644 index 0000000..2c1db29 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch @@ -0,0 +1,325 @@ +From 33eb6c08d7c615fad308001921c7b1148cbccfde Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:42 +0100 +Subject: cpufreq/schedutil: Remove iowait boost + +iowait boost in schedutil was introduced by +commit ("21ca6d2c52f8 cpufreq: schedutil: Add iowait boosting"). 
+with it more or less following intel_pstate's approach to increase +frequency after an iowait wakeup. +Behaviour that is piggy-backed onto iowait boost is problematic +due to a lot of reasons, so remove it. + +For schedutil specifically these are some of the reasons: +1. Boosting is applied even in scenarios where it doesn't improve +throughput. +2. The boost is not accounted for in EAS: a) feec() will only consider + the actual task utilization for task placement, but another CPU might + be more energy-efficient at that capacity than the boosted one.) + b) When placing a non-IO task while a CPU is boosted compute_energy() + assumes a lower OPP than what is actually applied. This leads to + wrong EAS decisions. +3. Actual IO heavy workloads are hardly distinguished from infrequent +in_iowait wakeups. +4. The boost isn't accounted for in task placement. +5. The boost isn't associated with a task, it therefore lingers on the +rq even after the responsible task has migrated / stopped. +6. The boost isn't associated with a task, it therefore needs to ramp +up again when migrated. +7. Since schedutil doesn't know which task is getting woken up, +multiple unrelated in_iowait tasks lead to boosting. +8. Boosting is hard to control with UCLAMP_MAX (which is only active +when the task is on the rq, which for boosted tasks is usually not +the case for most of the time). + +One benefit of schedutil specifically is the reliance on the +scheduler's utilization signals, which have evolved a lot since it's +original introduction. Some cases that benefitted from iowait boosting +in the past can now be covered by e.g. util_est. + +Signed-off-by: Christian Loehle +--- + kernel/sched/cpufreq_schedutil.c | 181 +------------------------------ + 1 file changed, 3 insertions(+), 178 deletions(-) + +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -6,12 +6,9 @@ + * Author: Rafael J. Wysocki + */ + +-#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) +- + struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +- unsigned int iowait_boost_cap; + }; + + struct sugov_policy { +@@ -36,8 +33,6 @@ struct sugov_policy { + + bool limits_changed; + bool need_freq_update; +- +- unsigned int iowait_boost_cap; + }; + + struct sugov_cpu { +@@ -45,10 +40,6 @@ struct sugov_cpu { + struct sugov_policy *sg_policy; + unsigned int cpu; + +- bool iowait_boost_pending; +- unsigned int iowait_boost; +- u64 last_update; +- + unsigned long util; + unsigned long bw_min; + +@@ -198,137 +189,15 @@ unsigned long sugov_effective_cpu_perf(i + return max(min, max); + } + +-static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) ++static void sugov_get_util(struct sugov_cpu *sg_cpu) + { + unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); +- util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); + } + +-/** +- * sugov_iowait_reset() - Reset the IO boost status of a CPU. +- * @sg_cpu: the sugov data for the CPU to boost +- * @time: the update time from the caller +- * @set_iowait_boost: true if an IO boost has been requested +- * +- * The IO wait boost of a task is disabled after a tick since the last update +- * of a CPU. If a new IO wait boost is requested after more then a tick, then +- * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy +- * efficiency by ignoring sporadic wakeups from IO. 
+- */ +-static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, +- bool set_iowait_boost) +-{ +- s64 delta_ns = time - sg_cpu->last_update; +- +- /* Reset boost only if a tick has elapsed since last request */ +- if (delta_ns <= TICK_NSEC) +- return false; +- +- sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; +- sg_cpu->iowait_boost_pending = set_iowait_boost; +- +- return true; +-} +- +-/** +- * sugov_iowait_boost() - Updates the IO boost status of a CPU. +- * @sg_cpu: the sugov data for the CPU to boost +- * @time: the update time from the caller +- * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait +- * +- * Each time a task wakes up after an IO operation, the CPU utilization can be +- * boosted to a certain utilization which doubles at each "frequent and +- * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization +- * of the maximum OPP. +- * +- * To keep doubling, an IO boost has to be requested at least once per tick, +- * otherwise we restart from the utilization of the minimum OPP. +- */ +-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, +- unsigned int flags) +-{ +- bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; +- +- /* Reset boost if the CPU appears to have been idle enough */ +- if (sg_cpu->iowait_boost && +- sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) +- return; +- +- /* Boost only tasks waking up after IO */ +- if (!set_iowait_boost) +- return; +- +- /* Ensure boost doubles only one time at each request */ +- if (sg_cpu->iowait_boost_pending) +- return; +- sg_cpu->iowait_boost_pending = true; +- +- /* Double the boost at each request */ +- if (sg_cpu->iowait_boost) { +- sg_cpu->iowait_boost = +- min_t(unsigned int, +- sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE); +- return; +- } +- +- /* First wakeup after IO: start with minimum boost */ +- sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; +-} +- +-/** +- * sugov_iowait_apply() - Apply the IO boost to a CPU. +- * @sg_cpu: the sugov data for the cpu to boost +- * @time: the update time from the caller +- * @max_cap: the max CPU capacity +- * +- * A CPU running a task which woken up after an IO operation can have its +- * utilization boosted to speed up the completion of those IO operations. +- * The IO boost value is increased each time a task wakes up from IO, in +- * sugov_iowait_apply(), and it's instead decreased by this function, +- * each time an increase has not been requested (!iowait_boost_pending). +- * +- * A CPU which also appears to have been idle for at least one tick has also +- * its IO boost utilization reset. +- * +- * This mechanism is designed to boost high frequently IO waiting tasks, while +- * being more conservative on tasks which does sporadic IO operations. +- */ +-static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +- unsigned long max_cap) +-{ +- /* No boost currently required */ +- if (!sg_cpu->iowait_boost) +- return 0; +- +- /* Reset boost if the CPU appears to have been idle enough */ +- if (sugov_iowait_reset(sg_cpu, time, false)) +- return 0; +- +- if (!sg_cpu->iowait_boost_pending) { +- /* +- * No boost pending; reduce the boost value. 
+- */ +- sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN; +- if (!sg_cpu->iowait_boost) +- return 0; +- } +- +- sg_cpu->iowait_boost_pending = false; +- +- if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap) +- sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap; +- +- /* +- * sg_cpu->util is already in capacity scale; convert iowait_boost +- * into the same scale so we can compare. +- */ +- return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; +-} +- + #ifdef CONFIG_NO_HZ_COMMON + static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) + { +@@ -356,18 +225,12 @@ static inline bool sugov_update_single_c + u64 time, unsigned long max_cap, + unsigned int flags) + { +- unsigned long boost; +- +- sugov_iowait_boost(sg_cpu, time, flags); +- sg_cpu->last_update = time; +- + ignore_dl_rate_limit(sg_cpu); + + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) + return false; + +- boost = sugov_iowait_apply(sg_cpu, time, max_cap); +- sugov_get_util(sg_cpu, boost); ++ sugov_get_util(sg_cpu); + + return true; + } +@@ -468,11 +331,8 @@ static unsigned int sugov_next_freq_shar + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); +- unsigned long boost; +- +- boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); +- sugov_get_util(j_sg_cpu, boost); + ++ sugov_get_util(j_sg_cpu); + util = max(j_sg_cpu->util, util); + } + +@@ -488,9 +348,6 @@ sugov_update_shared(struct update_util_d + + raw_spin_lock(&sg_policy->update_lock); + +- sugov_iowait_boost(sg_cpu, time, flags); +- sg_cpu->last_update = time; +- + ignore_dl_rate_limit(sg_cpu); + + if (sugov_should_update_freq(sg_policy, time)) { +@@ -560,14 +417,6 @@ static ssize_t rate_limit_us_show(struct + return sprintf(buf, "%u\n", tunables->rate_limit_us); + } + +- +-static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf) +-{ +- struct sugov_tunables *tunables = to_sugov_tunables(attr_set); +- +- return sprintf(buf, "%u\n", tunables->iowait_boost_cap); +-} +- + static ssize_t + rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) + { +@@ -586,30 +435,10 @@ rate_limit_us_store(struct gov_attr_set + return count; + } + +-static ssize_t +-iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count) +-{ +- struct sugov_tunables *tunables = to_sugov_tunables(attr_set); +- struct sugov_policy *sg_policy; +- unsigned int iowait_boost_cap; +- +- if (kstrtouint(buf, 10, &iowait_boost_cap)) +- return -EINVAL; +- +- tunables->iowait_boost_cap = iowait_boost_cap; +- +- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) +- sg_policy->iowait_boost_cap = iowait_boost_cap; +- +- return count; +-} +- + static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); +-static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap); + + static struct attribute *sugov_attrs[] = { + &rate_limit_us.attr, +- &iowait_boost_cap.attr, + NULL + }; + ATTRIBUTE_GROUPS(sugov); +@@ -799,8 +628,6 @@ static int sugov_init(struct cpufreq_pol + + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); + +- tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE; +- + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + +@@ -870,8 +697,6 @@ static int sugov_start(struct cpufreq_po + sg_policy->limits_changed = false; + sg_policy->cached_raw_freq = 0; + +- sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE; +- + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); 
+ + if (policy_is_shared(policy)) diff --git a/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch b/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch new file mode 100644 index 0000000..c255931 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch @@ -0,0 +1,113 @@ +From af7bbb59c2411e985a5d79173af5686337b4af9b Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:43 +0100 +Subject: cpufreq: intel_pstate: Remove iowait boost + +Analogous to schedutil, remove iowait boost for the same reasons. + +Signed-off-by: Christian Loehle +--- + drivers/cpufreq/intel_pstate.c | 50 ++-------------------------------- + 1 file changed, 3 insertions(+), 47 deletions(-) + +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -191,7 +191,6 @@ struct global_params { + * @policy: CPUFreq policy value + * @update_util: CPUFreq utility callback information + * @update_util_set: CPUFreq utility callback is set +- * @iowait_boost: iowait-related boost fraction + * @last_update: Time of the last update. + * @pstate: Stores P state limits for this CPU + * @vid: Stores VID limits for this CPU +@@ -245,7 +244,6 @@ struct cpudata { + struct acpi_processor_performance acpi_perf_data; + bool valid_pss_table; + #endif +- unsigned int iowait_boost; + s16 epp_powersave; + s16 epp_policy; + s16 epp_default; +@@ -2136,28 +2134,7 @@ static inline void intel_pstate_update_u + { + cpu->sample.time = time; + +- if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { +- bool do_io = false; +- +- cpu->sched_flags = 0; +- /* +- * Set iowait_boost flag and update time. Since IO WAIT flag +- * is set all the time, we can't just conclude that there is +- * some IO bound activity is scheduled on this CPU with just +- * one occurrence. If we receive at least two in two +- * consecutive ticks, then we treat as boost candidate. +- */ +- if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) +- do_io = true; +- +- cpu->last_io_update = time; +- +- if (do_io) +- intel_pstate_hwp_boost_up(cpu); +- +- } else { +- intel_pstate_hwp_boost_down(cpu); +- } ++ intel_pstate_hwp_boost_down(cpu); + } + + static inline void intel_pstate_update_util_hwp(struct update_util_data *data, +@@ -2240,9 +2217,6 @@ static inline int32_t get_target_pstate( + busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, + sample->tsc); + +- if (busy_frac < cpu->iowait_boost) +- busy_frac = cpu->iowait_boost; +- + sample->busy_scaled = busy_frac * 100; + + target = READ_ONCE(global.no_turbo) ? +@@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(s + sample->aperf, + sample->tsc, + get_avg_frequency(cpu), +- fp_toint(cpu->iowait_boost * 100)); ++ 0); + } + + static void intel_pstate_update_util(struct update_util_data *data, u64 time, +@@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(str + return; + + delta_ns = time - cpu->last_update; +- if (flags & SCHED_CPUFREQ_IOWAIT) { +- /* Start over if the CPU may have been idle. */ +- if (delta_ns > TICK_NSEC) { +- cpu->iowait_boost = ONE_EIGHTH_FP; +- } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { +- cpu->iowait_boost <<= 1; +- if (cpu->iowait_boost > int_tofp(1)) +- cpu->iowait_boost = int_tofp(1); +- } else { +- cpu->iowait_boost = ONE_EIGHTH_FP; +- } +- } else if (cpu->iowait_boost) { +- /* Clear iowait_boost if the CPU may have been idle. 
*/ +- if (delta_ns > TICK_NSEC) +- cpu->iowait_boost = 0; +- else +- cpu->iowait_boost >>= 1; +- } + cpu->last_update = time; + delta_ns = time - cpu->sample.time; + if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) +@@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct c + sample->aperf, + sample->tsc, + get_avg_frequency(cpu), +- fp_toint(cpu->iowait_boost * 100)); ++ 0); + } + + static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max, diff --git a/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch b/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch new file mode 100644 index 0000000..238f3b0 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch @@ -0,0 +1,42 @@ +From fd1e0723b0a7ad140d2bf7cd9154997d5ece2b37 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:44 +0100 +Subject: cpufreq: Remove SCHED_CPUFREQ_IOWAIT update + +Neither intel_pstate nor schedutil care for the flag anymore, so +remove the update and flag definition. + +Signed-off-by: Christian Loehle +--- + include/linux/sched/cpufreq.h | 2 -- + kernel/sched/fair.c | 8 -------- + 2 files changed, 10 deletions(-) + +--- a/include/linux/sched/cpufreq.h ++++ b/include/linux/sched/cpufreq.h +@@ -8,8 +8,6 @@ + * Interface between cpufreq drivers and the scheduler: + */ + +-#define SCHED_CPUFREQ_IOWAIT (1U << 0) +- + #ifdef CONFIG_CPU_FREQ + struct cpufreq_policy; + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6768,14 +6768,6 @@ enqueue_task_fair(struct rq *rq, struct + */ + util_est_enqueue(&rq->cfs, p); + +- /* +- * If in_iowait is set, the code below may not trigger any cpufreq +- * utilization updates, so do it here explicitly with the IOWAIT flag +- * passed. +- */ +- if (p->in_iowait) +- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); +- + for_each_sched_entity(se) { + if (se->on_rq) + break; diff --git a/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch b/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch new file mode 100644 index 0000000..6a2d4c4 --- /dev/null +++ b/debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch @@ -0,0 +1,55 @@ +From 30cdb8d7d06f51bb86142c537ea05bd01c31bb40 Mon Sep 17 00:00:00 2001 +From: Christian Loehle +Date: Thu, 5 Sep 2024 10:26:45 +0100 +Subject: io_uring: Do not set iowait before sleeping + +Setting in_iowait was introduced in commit +8a796565cec3 ("io_uring: Use io_schedule* in cqring wait") +to tackle a perf regression that was caused by menu taking iowait into +account for synchronous IO and thus not selecting deeper states like in +the io_uring counterpart. +That behaviour is gone, so the workaround can be removed. 
+ +Signed-off-by: Christian Loehle +--- + io_uring/io_uring.c | 17 ----------------- + 1 file changed, 17 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2359,15 +2359,6 @@ int io_run_task_work_sig(struct io_ring_ + return 0; + } + +-static bool current_pending_io(void) +-{ +- struct io_uring_task *tctx = current->io_uring; +- +- if (!tctx) +- return false; +- return percpu_counter_read_positive(&tctx->inflight); +-} +- + /* when returns >0, the caller should retry */ + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +@@ -2385,19 +2376,11 @@ static inline int io_cqring_wait_schedul + if (unlikely(io_should_wake(iowq))) + return 0; + +- /* +- * Mark us as being in io_wait if we have pending requests, so cpufreq +- * can take into account that the task is waiting for IO - turns out +- * to be important for low QD IO. +- */ +- if (current_pending_io()) +- current->in_iowait = 1; + ret = 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) + ret = -ETIME; +- current->in_iowait = 0; + return ret; + } + diff --git a/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch new file mode 100644 index 0000000..7fd04a8 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch @@ -0,0 +1,181 @@ +From e6b67a8d14e86d63062e6f1f234c5afc235561d4 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes + +The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for +lengths >= 512, due to the overhead of saving and restoring FPU state. +Therefore, it is unnecessary for this code to be excessively "optimized" +for lengths < 200. Eliminate the excessive unrolling of this part of +the code and use a more straightforward qword-at-a-time loop. + +Note: the part of the code in question is not entirely redundant, as it +is still used to process any remainder mod 24, as well as any remaining +data when fewer than 200 bytes remain after least one 3072-byte chunk. + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++---------------- + 1 file changed, 33 insertions(+), 83 deletions(-) + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -56,20 +56,10 @@ + .quad .Lcrc_\i + .endm + +-.macro JNC_LESS_THAN j +- jnc .Lless_than_\j +-.endm +- +-# Define threshold where buffers are considered "small" and routed to more +-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +-# SMALL_SIZE can be no larger than 255. +- ++# Define threshold below which buffers are considered "small" and routed to ++# regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-.if (SMALL_SIZE > 255) +-.error "SMALL_ SIZE must be < 256" +-.endif +- + # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + + .text +@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl) + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + ++ mov %bufp, bufptmp # rdi = *buf ++ cmp $SMALL_SIZE, len ++ jb .Lsmall ++ + ################################################################ + ## 1) ALIGN: + ################################################################ +- +- mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of + # the address + je .Lproc_block # Skip if aligned + +- ## If len is less than 8 and we're unaligned, we need to jump +- ## to special code to avoid reading beyond the end of the buffer +- cmp $8, len +- jae .Ldo_align +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $32-3+1, len_dw +- jmp .Lless_than_8_post_shl1 +- + .Ldo_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer +@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl) + jae .Lfull_block + + .Lcontinue_block: +- cmpq $SMALL_SIZE, len +- jb .Lsmall +- + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw +@@ -243,68 +223,38 @@ LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae .Lfull_block +- cmp $24, tmp ++ cmp $SMALL_SIZE, tmp + jae .Lcontinue_block + +-.Lless_than_24: +- shl $32-4, len_dw # less_than_16 expects length +- # in upper 4 bits of len_dw +- jnc .Lless_than_16 +- crc32q (bufptmp), crc_init +- crc32q 8(bufptmp), crc_init +- jz .Ldo_return +- add $16, bufptmp +- # len is less than 8 if we got here +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $2, len_dw +- jmp .Lless_than_8_post_shl1 +- + ####################################################################### +- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ++ ## 6) Process any remainder without interleaving: + ####################################################################### + .Lsmall: +- shl $32-8, len_dw # Prepare len_dw for less_than_256 +- j=256 +-.rept 5 # j = {256, 128, 64, 32, 16} +-.altmacro +-LABEL less_than_ %j # less_than_j: Length should be in +- # upper lg(j) bits of len_dw +- j=(j/2) +- shl $1, len_dw # Get next MSB +- JNC_LESS_THAN %j +-.noaltmacro +- i=0 +-.rept (j/8) +- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data +- i=i+8 +-.endr +- jz .Ldo_return # Return if remaining length is zero +- add $j, bufptmp # Advance buf +-.endr +- +-.Lless_than_8: # Length should be stored in +- # upper 3 bits of len_dw +- shl $1, len_dw +-.Lless_than_8_post_shl1: +- jnc .Lless_than_4 +- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes +- jz .Ldo_return # return if remaining data is zero +- add $4, bufptmp +-.Lless_than_4: # Length should be stored in +- # upper 2 bits of len_dw +- shl $1, len_dw +- jnc .Lless_than_2 +- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes +- jz .Ldo_return # return if remaining data is zero +- add $2, bufptmp +-.Lless_than_2: # Length should be stored in the MSB +- # of len_dw +- shl $1, len_dw +- jnc .Lless_than_1 +- crc32b (bufptmp), crc_init_dw # CRC of 1 byte +-.Lless_than_1: # Length should be zero +-.Ldo_return: ++ test len, len ++ jz .Ldone ++ mov len_dw, %eax ++ shr $3, %eax ++ jz .Ldo_dword ++.Ldo_qwords: ++ 
crc32q (bufptmp), crc_init ++ add $8, bufptmp ++ dec %eax ++ jnz .Ldo_qwords ++.Ldo_dword: ++ test $4, len_dw ++ jz .Ldo_word ++ crc32l (bufptmp), crc_init_dw ++ add $4, bufptmp ++.Ldo_word: ++ test $2, len_dw ++ jz .Ldo_byte ++ crc32w (bufptmp), crc_init_dw ++ add $2, bufptmp ++.Ldo_byte: ++ test $1, len_dw ++ jz .Ldone ++ crc32b (bufptmp), crc_init_dw ++.Ldone: + movq crc_init, %rax + popq %rsi + popq %rdi diff --git a/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch new file mode 100644 index 0000000..b3e564c --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch @@ -0,0 +1,187 @@ +From 430478d63b1403878f2fd4b12de2cd21ee502184 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit + +Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit +values instead of 64-bit, since the upper bits of the corresponding +64-bit registers are not guaranteed to be zero. Also update the type of +the length argument to be unsigned int rather than int, as the assembly +code treats it as unsigned. + +Note: there haven't been any reports of this bug actually causing +incorrect behavior. Neither gcc nor clang guarantee zero-extension to +64 bits, but zero-extension is likely to happen in practice because most +instructions that operate on 32-bit registers zero-extend to 64 bits. + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-intel_glue.c | 2 +- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------ + 2 files changed, 27 insertions(+), 32 deletions(-) + +--- a/arch/x86/crypto/crc32c-intel_glue.c ++++ b/arch/x86/crypto/crc32c-intel_glue.c +@@ -41,7 +41,7 @@ + */ + #define CRC32C_PCL_BREAKEVEN 512 + +-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, ++asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, + unsigned int crc_init); + #endif /* CONFIG_X86_64 */ + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -60,7 +60,7 @@ + # regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); ++# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); + + .text + SYM_FUNC_START(crc_pcl) +@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl) + #define block_0 %rcx + #define block_1 %rdx + #define block_2 %r11 +-#define len %rsi +-#define len_dw %esi +-#define len_w %si +-#define len_b %sil +-#define crc_init_arg %rdx ++#define len %esi ++#define crc_init_arg %edx + #define tmp %rbx +-#define crc_init %r8 +-#define crc_init_dw %r8d ++#define crc_init %r8d ++#define crc_init_q %r8 + #define crc1 %r9 + #define crc2 %r10 + +@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl) + movq (bufptmp), tmp # load a quadward from the buffer + add %bufp, bufptmp # align buffer pointer for quadword + # processing +- sub %bufp, len # update buffer length ++ sub bufp_dw, len # update buffer length + .Lalign_loop: +- crc32b %bl, crc_init_dw # compute crc32 of 1-byte ++ crc32b %bl, crc_init # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec %bufp + jne .Lalign_loop +@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl) + ################################################################ + + ## compute num of bytes to be processed +- movq len, tmp # save num bytes in tmp + +- cmpq $128*24, len ++ cmp $128*24, len + jae .Lfull_block + + .Lcontinue_block: + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len_dw ++ mul len + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do +@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl) + LABEL crc_ %i + .noaltmacro + ENDBR +- crc32q -i*8(block_0), crc_init ++ crc32q -i*8(block_0), crc_init_q + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +@@ -186,7 +182,7 @@ LABEL crc_ %i + LABEL crc_ %i + .noaltmacro + ENDBR +- crc32q -i*8(block_0), crc_init ++ crc32q -i*8(block_0), crc_init_q + crc32q -i*8(block_1), crc1 + # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + +@@ -200,9 +196,9 @@ LABEL crc_ %i + shlq $3, %rax # rax *= 8 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 + leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- subq %rax, tmp # tmp -= rax*24 ++ sub %eax, len # len -= rax*24 + +- movq crc_init, %xmm1 # CRC for block 1 ++ movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 +@@ -211,8 +207,8 @@ LABEL crc_ %i + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax +- mov crc2, crc_init +- crc32 %rax, crc_init ++ mov crc2, crc_init_q ++ crc32 %rax, crc_init_q + + ################################################################ + ## 5) Check for end: +@@ -220,10 +216,9 @@ LABEL crc_ %i + + LABEL crc_ 0 + ENDBR +- mov tmp, len +- cmp $128*24, tmp ++ cmp $128*24, len + jae .Lfull_block +- cmp $SMALL_SIZE, tmp ++ cmp $SMALL_SIZE, len + jae .Lcontinue_block + + ####################################################################### +@@ -232,30 +227,30 @@ LABEL crc_ 0 + .Lsmall: + test len, len + jz .Ldone +- mov len_dw, %eax ++ mov len, %eax + shr $3, %eax + jz .Ldo_dword + .Ldo_qwords: +- crc32q (bufptmp), crc_init ++ crc32q (bufptmp), crc_init_q + add $8, bufptmp + dec %eax + jnz .Ldo_qwords + .Ldo_dword: +- test $4, len_dw ++ test $4, len + jz .Ldo_word +- crc32l (bufptmp), crc_init_dw ++ crc32l (bufptmp), crc_init + add $4, bufptmp + .Ldo_word: +- test $2, len_dw ++ test $2, len + jz .Ldo_byte +- crc32w (bufptmp), crc_init_dw ++ crc32w (bufptmp), crc_init + add $2, bufptmp + .Ldo_byte: +- test $1, len_dw 
++ test $1, len + jz .Ldone +- crc32b (bufptmp), crc_init_dw ++ crc32b (bufptmp), crc_init + .Ldone: +- movq crc_init, %rax ++ mov crc_init, %eax + popq %rsi + popq %rdi + popq %rbx diff --git a/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch new file mode 100644 index 0000000..faaae59 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch @@ -0,0 +1,374 @@ +From 8706bf3e3cba8c708f9933f0d1c6a23f9c2c8c33 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 13 Oct 2024 21:06:49 -0700 +Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling + +crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully +unrolled and uses a jump table to jump into the correct location. This +optimization is misguided, as it bloats the binary code size and +introduces an indirect call. x86_64 CPUs can predict loops well, so it +is fine to just use a loop instead. Loop bookkeeping instructions can +compete with the crc instructions for the ALUs, but this is easily +mitigated by unrolling the loop by a smaller amount, such as 4 times. + +Therefore, re-roll the loop and make related tweaks to the code. + +This reduces the binary code size of crc_pclmul() from 4546 bytes to 418 +bytes, a 91% reduction. In general it also makes the code faster, with +some large improvements seen when retpoline is enabled. + +More detailed performance results are shown below. They are given as +percent improvement in throughput (negative means regressed) for CPU +microarchitecture vs. input length in bytes. E.g. an improvement from +40 GB/s to 50 GB/s would be listed as 25%. + +Table 1: Results with retpoline enabled (the default): + + | 512 | 833 | 1024 | 2000 | 3173 | 4096 | + ---------------------+-------+-------+-------+------ +-------+-------+ + Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% | + Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% | + AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% | + +Table 2: Results with retpoline disabled: + + | 512 | 833 | 1024 | 2000 | 3173 | 4096 | + ---------------------+-------+-------+-------+------ +-------+-------+ + Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% | + Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% | + AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% | + +Signed-off-by: Eric Biggers +--- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++------------- + 1 file changed, 92 insertions(+), 141 deletions(-) + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -7,6 +7,7 @@ + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. ++ * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali +@@ -44,18 +45,9 @@ + */ + + #include +-#include + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +-.macro LABEL prefix n +-.L\prefix\n\(): +-.endm +- +-.macro JMPTBL_ENTRY i +-.quad .Lcrc_\i +-.endm +- + # Define threshold below which buffers are considered "small" and routed to + # regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 +@@ -64,139 +56,116 @@ + + .text + SYM_FUNC_START(crc_pcl) +-#define bufp rdi +-#define bufp_dw %edi +-#define bufp_w %di +-#define bufp_b %dil +-#define bufptmp %rcx +-#define block_0 %rcx +-#define block_1 %rdx +-#define block_2 %r11 +-#define len %esi +-#define crc_init_arg %edx +-#define tmp %rbx +-#define crc_init %r8d +-#define crc_init_q %r8 +-#define crc1 %r9 +-#define crc2 %r10 +- +- pushq %rbx +- pushq %rdi +- pushq %rsi +- +- ## Move crc_init for Linux to a different +- mov crc_init_arg, crc_init ++#define bufp %rdi ++#define bufp_d %edi ++#define len %esi ++#define crc_init %edx ++#define crc_init_q %rdx ++#define n_misaligned %ecx /* overlaps chunk_bytes! */ ++#define n_misaligned_q %rcx ++#define chunk_bytes %ecx /* overlaps n_misaligned! */ ++#define chunk_bytes_q %rcx ++#define crc1 %r8 ++#define crc2 %r9 + +- mov %bufp, bufptmp # rdi = *buf + cmp $SMALL_SIZE, len + jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ +- neg %bufp +- and $7, %bufp # calculate the unalignment amount of ++ mov bufp_d, n_misaligned ++ neg n_misaligned ++ and $7, n_misaligned # calculate the misalignment amount of + # the address +- je .Lproc_block # Skip if aligned ++ je .Laligned # Skip if aligned + ++ # Process 1 <= n_misaligned <= 7 bytes individually in order to align ++ # the remaining data to an 8-byte boundary. + .Ldo_align: +- #### Calculate CRC of unaligned bytes of the buffer (if any) +- movq (bufptmp), tmp # load a quadward from the buffer +- add %bufp, bufptmp # align buffer pointer for quadword +- # processing +- sub bufp_dw, len # update buffer length ++ movq (bufp), %rax ++ add n_misaligned_q, bufp ++ sub n_misaligned, len + .Lalign_loop: +- crc32b %bl, crc_init # compute crc32 of 1-byte +- shr $8, tmp # get next byte +- dec %bufp ++ crc32b %al, crc_init # compute crc32 of 1-byte ++ shr $8, %rax # get next byte ++ dec n_misaligned + jne .Lalign_loop +- +-.Lproc_block: ++.Laligned: + + ################################################################ +- ## 2) PROCESS BLOCKS: ++ ## 2) PROCESS BLOCK: + ################################################################ + +- ## compute num of bytes to be processed +- + cmp $128*24, len + jae .Lfull_block + +-.Lcontinue_block: +- ## len < 128*24 +- movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len +- shrq $16, %rax +- +- ## eax contains floor(bytes / 24) = num 24-byte chunks to do +- +- ## process rax 24-byte chunks (128 >= rax >= 0) +- +- ## compute end address of each block +- ## block 0 (base addr + RAX * 8) +- ## block 1 (base addr + RAX * 16) +- ## block 2 (base addr + RAX * 24) +- lea (bufptmp, %rax, 8), block_0 +- lea (block_0, %rax, 8), block_1 +- lea (block_1, %rax, 8), block_2 +- +- xor crc1, crc1 +- xor crc2, crc2 +- +- ## branch into array +- leaq jump_table(%rip), %bufp +- mov (%bufp,%rax,8), %bufp +- JMP_NOSPEC bufp ++.Lpartial_block: ++ # Compute floor(len / 24) to get num qwords to process from each lane. 
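++	# Multiplying by 2731 = ceil(2^16 / 24) and shifting right by 16 is a
++	# fixed-point reciprocal: it overshoots len/24 by exactly len/196608,
++	# which for len < 128*24 stays below 1/24 and so never carries the
++	# result past the next multiple of 1/24.  E.g. len = 3000:
++	# 3000 * 2731 = 8193000, and 8193000 >> 16 = 125 = 3000/24.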
++ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) ++ shr $16, %eax ++ jmp .Lcrc_3lanes + +- ################################################################ +- ## 2a) PROCESS FULL BLOCKS: +- ################################################################ + .Lfull_block: +- movl $128,%eax +- lea 128*8*2(block_0), block_1 +- lea 128*8*3(block_0), block_2 +- add $128*8*1, block_0 +- +- xor crc1,crc1 +- xor crc2,crc2 +- +- # Fall through into top of crc array (crc_128) ++ # Processing 128 qwords from each lane. ++ mov $128, %eax + + ################################################################ +- ## 3) CRC Array: ++ ## 3) CRC each of three lanes: + ################################################################ + +- i=128 +-.rept 128-1 +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init_q +- crc32q -i*8(block_1), crc1 +- crc32q -i*8(block_2), crc2 +- i=(i-1) +-.endr +- +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init_q +- crc32q -i*8(block_1), crc1 +-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet ++.Lcrc_3lanes: ++ xor crc1,crc1 ++ xor crc2,crc2 ++ mov %eax, chunk_bytes ++ shl $3, chunk_bytes # num bytes to process from each lane ++ sub $5, %eax # 4 for 4x_loop, 1 for special last iter ++ jl .Lcrc_3lanes_4x_done ++ ++ # Unroll the loop by a factor of 4 to reduce the overhead of the loop ++ # bookkeeping instructions, which can compete with crc32q for the ALUs. ++.Lcrc_3lanes_4x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ crc32q 8(bufp), crc_init_q ++ crc32q 8(bufp,chunk_bytes_q), crc1 ++ crc32q 8(bufp,chunk_bytes_q,2), crc2 ++ crc32q 16(bufp), crc_init_q ++ crc32q 16(bufp,chunk_bytes_q), crc1 ++ crc32q 16(bufp,chunk_bytes_q,2), crc2 ++ crc32q 24(bufp), crc_init_q ++ crc32q 24(bufp,chunk_bytes_q), crc1 ++ crc32q 24(bufp,chunk_bytes_q,2), crc2 ++ add $32, bufp ++ sub $4, %eax ++ jge .Lcrc_3lanes_4x_loop ++ ++.Lcrc_3lanes_4x_done: ++ add $4, %eax ++ jz .Lcrc_3lanes_last_qword ++ ++.Lcrc_3lanes_1x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ add $8, bufp ++ dec %eax ++ jnz .Lcrc_3lanes_1x_loop + +- mov block_2, block_0 ++.Lcrc_3lanes_last_qword: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + +- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 +- shlq $3, %rax # rax *= 8 +- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 +- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- sub %eax, len # len -= rax*24 ++ lea (K_table-8)(%rip), %rax # first entry is for idx 1 ++ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 ++ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 ++ sub %eax, len # len -= chunk_bytes * 3 + + movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 +@@ -206,20 +175,19 @@ LABEL crc_ %i + + pxor %xmm2,%xmm1 + movq %xmm1, %rax +- xor -i*8(block_2), %rax ++ xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc_init_q + crc32 %rax, crc_init_q ++ lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ +- ## 5) Check for end: ++ ## 5) If more blocks remain, goto (2): + 
################################################################ + +-LABEL crc_ 0 +- ENDBR + cmp $128*24, len +- jae .Lfull_block ++ jae .Lfull_block + cmp $SMALL_SIZE, len +- jae .Lcontinue_block ++ jae .Lpartial_block + + ####################################################################### + ## 6) Process any remainder without interleaving: +@@ -231,47 +199,30 @@ LABEL crc_ 0 + shr $3, %eax + jz .Ldo_dword + .Ldo_qwords: +- crc32q (bufptmp), crc_init_q +- add $8, bufptmp ++ crc32q (bufp), crc_init_q ++ add $8, bufp + dec %eax + jnz .Ldo_qwords + .Ldo_dword: + test $4, len + jz .Ldo_word +- crc32l (bufptmp), crc_init +- add $4, bufptmp ++ crc32l (bufp), crc_init ++ add $4, bufp + .Ldo_word: + test $2, len + jz .Ldo_byte +- crc32w (bufptmp), crc_init +- add $2, bufptmp ++ crc32w (bufp), crc_init ++ add $2, bufp + .Ldo_byte: + test $1, len + jz .Ldone +- crc32b (bufptmp), crc_init ++ crc32b (bufp), crc_init + .Ldone: + mov crc_init, %eax +- popq %rsi +- popq %rdi +- popq %rbx + RET + SYM_FUNC_END(crc_pcl) + + .section .rodata, "a", @progbits +- ################################################################ +- ## jump table Table is 129 entries x 2 bytes each +- ################################################################ +-.align 4 +-jump_table: +- i=0 +-.rept 129 +-.altmacro +-JMPTBL_ENTRY %i +-.noaltmacro +- i=i+1 +-.endr +- +- + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each diff --git a/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch b/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch new file mode 100644 index 0000000..1fb3740 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch @@ -0,0 +1,31 @@ +From cda0e050fec85635986e9cfe991e26339bf305dc Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sat, 13 Jan 2024 15:29:25 +0100 +Subject: arch/Kconfig: Default to maximum amount of ASLR bits + +To mitigate https://zolutal.github.io/aslrnt/; do this with a patch to +avoid having to enable `CONFIG_EXPERT`. 
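+
+As a quick sanity check (illustrative only; the sysctl names assume an
+architecture that provides HAVE_ARCH_MMAP_RND_BITS, such as x86_64 or arm64),
+the number of bits in effect and the resulting jitter of the mmap base can be
+observed on a running system:
+
+    $ sysctl vm.mmap_rnd_bits vm.mmap_rnd_compat_bits
+    $ for i in 1 2 3; do grep -m1 -F libc.so /proc/self/maps; done
+
+(each grep is a fresh process, so its libc mapping moves between iterations)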
+--- + arch/Kconfig | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to diff --git a/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch b/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch new file mode 100644 index 0000000..8b422f7 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch @@ -0,0 +1,112 @@ +From b7d96c1f19ef15ea431a8d5d7ab2cad22c35edba Mon Sep 17 00:00:00 2001 +From: Qais Yousef +Date: Sun, 28 Jul 2024 20:26:59 +0100 +Subject: cpufreq: Remove LATENCY_MULTIPLIER + +The current LATENCY_MULTIPLIER which has been around for nearly 20 years +causes rate_limit_us to be always in ms range. + +On M1 mac mini I get 50 and 56us transition latency, but due to the 1000 +multiplier we end up setting rate_limit_us to 50 and 56ms, which gets +capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change +default transition delay to 2ms") + +On Intel I5 system transition latency is 20us but due to the multiplier +we end up with 20ms that again is capped to 2ms. + +Given how good modern hardware and how modern workloads require systems +to be more responsive to cater for sudden changes in workload (tasks +sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and +that 2ms is quarter of the time of 120Hz refresh rate system, drop the +old logic in favour of providing 50% headroom. + + rate_limit_us = 1.5 * latency. + +I considered not adding any headroom which could mean that we can end up +with infinite back-to-back requests. + +I also considered providing a constant headroom (e.g: 100us) assuming +that any h/w or f/w dealing with the request shouldn't require a large +headroom when transition_latency is actually high. + +But for both cases I wasn't sure if h/w or f/w can end up being +overwhelmed dealing with the freq requests in a potentially busy system. +So I opted for providing 50% breathing room. + +This is expected to impact schedutil only as the other user, +dbs_governor, takes the max(2*tick, transition_delay_us) and the former +was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us +before applying this patch. For systems with TICK of 4ms, this value +would have almost always ended up with 8ms sampling rate. + +For systems that report 0 transition latency, we still default to +returning 1ms as transition delay. + +This helps in eliminating a source of latency for applying requests as +mentioned in [1]. 
For example if we have a 1ms tick, most systems will +miss sending an update at tick when updating the util_avg for a task/CPU +(rate_limit_us will be 2ms for most systems). + +Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1] +Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/ +Signed-off-by: Qais Yousef +Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io +[ rjw: Subject edits ] +Signed-off-by: Rafael J. Wysocki +--- + drivers/cpufreq/cpufreq.c | 27 ++++----------------------- + include/linux/cpufreq.h | 6 ------ + 2 files changed, 4 insertions(+), 29 deletions(-) + +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_d + return policy->transition_delay_us; + + latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; +- if (latency) { +- unsigned int max_delay_us = 2 * MSEC_PER_SEC; ++ if (latency) ++ /* Give a 50% breathing room between updates */ ++ return latency + (latency >> 1); + +- /* +- * If the platform already has high transition_latency, use it +- * as-is. +- */ +- if (latency > max_delay_us) +- return latency; +- +- /* +- * For platforms that can change the frequency very fast (< 2 +- * us), the above formula gives a decent transition delay. But +- * for platforms where transition_latency is in milliseconds, it +- * ends up giving unrealistic values. +- * +- * Cap the default transition delay to 2 ms, which seems to be +- * a reasonable amount of time after which we should reevaluate +- * the frequency. +- */ +- return min(latency * LATENCY_MULTIPLIER, max_delay_us); +- } +- +- return LATENCY_MULTIPLIER; ++ return USEC_PER_MSEC; + } + EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); + +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scal + #define CPUFREQ_POLICY_POWERSAVE (1) + #define CPUFREQ_POLICY_PERFORMANCE (2) + +-/* +- * The polling frequency depends on the capability of the processor. Default +- * polling frequency is 1000 times the transition latency of the processor. +- */ +-#define LATENCY_MULTIPLIER (1000) +- + struct cpufreq_governor { + char name[CPUFREQ_NAME_LEN]; + int (*init)(struct cpufreq_policy *policy); diff --git a/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch b/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch new file mode 100644 index 0000000..a97bffc --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch @@ -0,0 +1,83 @@ +From 218e958524c673d6e68737e7f82d80ba2b6ef59a Mon Sep 17 00:00:00 2001 +From: Javier Martinez Canillas +Date: Thu, 19 May 2022 14:40:07 +0200 +Subject: drivers/firmware: skip simpledrm if nvidia-drm.modeset=1 is set + +The Nvidia proprietary driver has some bugs that leads to issues if used +with the simpledrm driver. The most noticeable is that does not register +an emulated fbdev device. + +It just relies on a fbdev to be registered by another driver, that could +be that could be attached to the framebuffer console. On UEFI machines, +this is the efifb driver. + +This means that disabling the efifb driver will cause virtual consoles to +not be present in the system when using the Nvidia driver. Legacy BIOS is +not affected just because fbcon is not used there, but instead vgacon. 
+ +Unless a VGA mode is specified using the vga= kernel command line option, +in that case the vesafb driver is used instead and its fbdev attached to +the fbcon. + +This is a problem because with CONFIG_SYSFB_SIMPLEFB=y, the sysfb platform +code attempts to register a "simple-framebuffer" platform device (that is +matched against simpledrm) and only registers either an "efi-framebuffer" +or "vesa-framebuffer" if this fails to be registered due the video modes +not being compatible. + +The Nvidia driver relying on another driver to register the fbdev is quite +fragile, since it can't really assume those will stick around. For example +there are patches posted to remove the EFI and VESA platform devices once +a real DRM or fbdev driver probes. + +But in any case, moving to a simpledrm + emulated fbdev only breaks this +assumption and causes users to not have VT if the Nvidia driver is used. + +So to prevent this, let's add a workaround and make the sysfb to skip the +"simple-framebuffer" registration when nvidia-drm.modeset=1 option is set. + +This is quite horrible, but honestly I can't think of any other approach. + +For this to work, the CONFIG_FB_EFI and CONFIG_FB_VESA config options must +be enabled besides CONFIG_DRM_SIMPLEDRM. + +Signed-off-by: Javier Martinez Canillas +Cherry-picked-for: https://bugs.archlinux.org/task/73720 +--- + drivers/firmware/sysfb.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +--- a/drivers/firmware/sysfb.c ++++ b/drivers/firmware/sysfb.c +@@ -35,6 +35,22 @@ + #include + #include + ++static int skip_simpledrm; ++ ++static int __init simpledrm_disable(char *opt) ++{ ++ if (!opt) ++ return -EINVAL; ++ ++ get_option(&opt, &skip_simpledrm); ++ ++ if (skip_simpledrm) ++ pr_info("The simpledrm driver will not be probed\n"); ++ ++ return 0; ++} ++early_param("nvidia-drm.modeset", simpledrm_disable); ++ + static struct platform_device *pd; + static DEFINE_MUTEX(disable_lock); + static bool disabled; +@@ -145,7 +161,7 @@ static __init int sysfb_init(void) + + /* try to create a simple-framebuffer device */ + compatible = sysfb_parse_mode(si, &mode); +- if (compatible) { ++ if (compatible && !skip_simpledrm) { + pd = sysfb_create_simplefb(si, &mode, parent); + if (!IS_ERR(pd)) + goto put_device; diff --git a/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch b/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch new file mode 100644 index 0000000..9086a94 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch @@ -0,0 +1,26 @@ +From b97d21a0aa65a6f7a7bb17bbc696b136688c96ed Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Mon, 26 Aug 2024 08:50:11 -0400 +Subject: nfsd: add more info to WARN_ON_ONCE on failed callbacks + +Currently, you get the warning and stack trace, but nothing is printed +about the relevant error codes. Add that in. 
+ +Signed-off-by: Jeff Layton +Signed-off-by: Chuck Lever +--- + fs/nfsd/nfs4callback.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/nfsd/nfs4callback.c ++++ b/fs/nfsd/nfs4callback.c +@@ -1333,7 +1333,8 @@ static void nfsd4_cb_done(struct rpc_tas + return; + + if (cb->cb_status) { +- WARN_ON_ONCE(task->tk_status); ++ WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d", ++ cb->cb_status, task->tk_status); + task->tk_status = cb->cb_status; + } + diff --git a/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch b/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch new file mode 100644 index 0000000..5b33527 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch @@ -0,0 +1,57 @@ +From 1d120544580708eae6bd5981b308ca17735edaac Mon Sep 17 00:00:00 2001 +From: Vitaly Lifshits +Date: Tue, 1 Oct 2024 20:08:48 +0300 +Subject: e1000e: Remove Meteor Lake SMBUS workarounds + +This is a partial revert to commit 76a0a3f9cc2f ("e1000e: fix force smbus +during suspend flow"). That commit fixed a sporadic PHY access issue but +introduced a regression in runtime suspend flows. +The original issue on Meteor Lake systems was rare in terms of the +reproduction rate and the number of the systems affected. + +After the integration of commit 0a6ad4d9e169 ("e1000e: avoid failing the +system during pm_suspend"), PHY access loss can no longer cause a +system-level suspend failure. As it only occurs when the LAN cable is +disconnected, and is recovered during system resume flow. Therefore, its +functional impact is low, and the priority is given to stabilizing +runtime suspend. + +Fixes: 76a0a3f9cc2f ("e1000e: fix force smbus during suspend flow") +Signed-off-by: Vitaly Lifshits +--- + drivers/net/ethernet/intel/e1000e/ich8lan.c | 17 ++++------------- + 1 file changed, 4 insertions(+), 13 deletions(-) + +--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c ++++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c +@@ -1205,12 +1205,10 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + if (ret_val) + goto out; + +- if (hw->mac.type != e1000_pch_mtp) { +- ret_val = e1000e_force_smbus(hw); +- if (ret_val) { +- e_dbg("Failed to force SMBUS: %d\n", ret_val); +- goto release; +- } ++ ret_val = e1000e_force_smbus(hw); ++ if (ret_val) { ++ e_dbg("Failed to force SMBUS: %d\n", ret_val); ++ goto release; + } + + /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable +@@ -1273,13 +1271,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + } + + release: +- if (hw->mac.type == e1000_pch_mtp) { +- ret_val = e1000e_force_smbus(hw); +- if (ret_val) +- e_dbg("Failed to force SMBUS over MTL system: %d\n", +- ret_val); +- } +- + hw->phy.ops.release(hw); + out: + if (ret_val) diff --git a/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch b/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch new file mode 100644 index 0000000..cc462c6 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch @@ -0,0 +1,46 @@ +From 4086c1a804741c9c8f418d6088e8c531f2a481f3 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 1 Oct 2024 17:03:32 +0900 +Subject: btrfs: zoned: fix zone unusable accounting for freed reserved extent + +When btrfs reserves an extent and does not use it (e.g, by an error), it +calls btrfs_free_reserved_extent() to free the reserved extent. 
In the +process, it calls btrfs_add_free_space() and then it accounts the region +bytes as block_group->zone_unusable. + +However, it leaves the space_info->bytes_zone_unusable side not updated. As +a result, ENOSPC can happen while a space_info reservation succeeded. The +reservation is fine because the freed region is not added in +space_info->bytes_zone_unusable, leaving that space as "free". OTOH, +corresponding block group counts it as zone_unusable and its allocation +pointer is not rewound, we cannot allocate an extent from that block group. +That will also negate space_info's async/sync reclaim process, and cause an +ENOSPC error from the extent allocation process. + +Fix that by returning the space to space_info->bytes_zone_unusable. +Ideally, since a bio is not submitted for this reserved region, we should +return the space to free space and rewind the allocation pointer. But, it +needs rework on extent allocation handling, so let it work in this way for +now. + +Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/block-group.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct bt + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; ++ else if (btrfs_is_zoned(cache->fs_info)) ++ space_info->bytes_zone_unusable += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; diff --git a/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch b/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch new file mode 100644 index 0000000..42144a5 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch @@ -0,0 +1,64 @@ +From aa8155f0ba032729ec4f28c5cb9669fb14f6947b Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 14 Oct 2024 16:14:18 +0100 +Subject: btrfs: clear force-compress on remount when compress mount option is + given + +After the migration to use fs context for processing mount options we had +a slight change in the semantics for remounting a filesystem that was +mounted with compress-force. Before we could clear compress-force by +passing only "-o compress[=algo]" during a remount, but after that change +that does not work anymore, force-compress is still present and one needs +to pass "-o compress-force=no,compress[=algo]" to the mount command. 
+ +Example, when running on a kernel 6.8+: + + $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) + + $ mount -o remount,compress=zlib:5 /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) + +On a 6.7 kernel (or older): + + $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) + + $ mount -o remount,compress=zlib:5 /mnt/sdi + $ mount | grep sdi + /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) + +So update btrfs_parse_param() to clear "compress-force" when "compress" is +given, providing the same semantics as kernel 6.7 and older. + +Reported-by: Roman Mamedov +Link: https://lore.kernel.org/linux-btrfs/20241014182416.13d0f8b0@nvm/ +CC: stable@vger.kernel.org # 6.8+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/super.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -340,6 +340,15 @@ static int btrfs_parse_param(struct fs_c + fallthrough; + case Opt_compress: + case Opt_compress_type: ++ /* ++ * Provide the same semantics as older kernels that don't use fs ++ * context, specifying the "compress" option clears ++ * "force-compress" without the need to pass ++ * "compress-force=[no|none]" before specifying "compress". ++ */ ++ if (opt != Opt_compress_force && opt != Opt_compress_force_type) ++ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); ++ + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; diff --git a/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch b/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch new file mode 100644 index 0000000..2c1b65f --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch @@ -0,0 +1,68 @@ +From 81baeb2a67d8245ac5b61299e54dd65defd4ac72 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 10 Sep 2024 15:21:04 +0930 +Subject: btrfs: qgroup: set a more sane default value for subtree drop + threshold + +Since commit 011b46c30476 ("btrfs: skip subtree scan if it's too high to +avoid low stall in btrfs_commit_transaction()"), btrfs qgroup can +automatically skip large subtree scan at the cost of marking qgroup +inconsistent. + +It's designed to address the final performance problem of snapshot drop +with qgroup enabled, but to be safe the default value is +BTRFS_MAX_LEVEL, requiring a user space daemon to set a different value +to make it work. + +I'd say it's not a good idea to rely on user space tool to set this +default value, especially when some operations (snapshot dropping) can +be triggered immediately after mount, leaving a very small window to +that that sysfs interface. + +So instead of disabling this new feature by default, enable it with a +low threshold (3), so that large subvolume tree drop at mount time won't +cause huge qgroup workload. 
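+
+For reference, the threshold remains tunable at runtime through sysfs. A
+sketch, assuming a 6.1+ kernel where the knob is exposed in the
+per-filesystem qgroups directory (adjust the path if your kernel differs):
+
+    $ uuid=$(findmnt -no UUID /mnt/btrfs)
+    $ cat /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold
+    $ echo 8 > /sys/fs/btrfs/$uuid/qgroups/drop_subtree_threshold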
+ +CC: stable@vger.kernel.org # 6.1 +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/qgroup.h | 2 ++ + 3 files changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1960,7 +1960,7 @@ static void btrfs_init_qgroup(struct btr + fs_info->qgroup_seq = 1; + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; +- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; ++ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; + mutex_init(&fs_info->qgroup_rescan_lock); + } + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1407,7 +1407,7 @@ int btrfs_quota_disable(struct btrfs_fs_ + fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; +- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; ++ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; + spin_unlock(&fs_info->qgroup_lock); + + btrfs_free_qgroup_config(fs_info); +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -121,6 +121,8 @@ struct btrfs_inode; + #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) + #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) + ++#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) ++ + /* + * Record a dirty extent, and info qgroup to update quota on it + */ diff --git a/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch b/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch new file mode 100644 index 0000000..44112eb --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch @@ -0,0 +1,32 @@ +From 8ea93b01558ea7a752e478ad25862e7441d6053a Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Thu, 19 Sep 2024 12:16:38 +0200 +Subject: btrfs: also add stripe entries for NOCOW writes + +NOCOW writes do not generate stripe_extent entries in the RAID stripe +tree, as the RAID stripe-tree feature initially was designed with a +zoned filesystem in mind and on a zoned filesystem, we do not allow NOCOW +writes. But the RAID stripe-tree feature is independent from the zoned +feature, so we must also do NOCOW writes for RAID stripe-tree filesystems. 
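+
+For context, a NOCOW write is an overwrite of a file that carries the NOCOW
+attribute. An illustrative way to produce one (file name arbitrary):
+
+    $ touch f && chattr +C f      # NOCOW must be set while the file is empty
+    $ dd if=/dev/urandom of=f bs=64K count=16                # allocates extents
+    $ dd if=/dev/urandom of=f bs=64K count=16 conv=notrunc   # overwrite is NOCOW
+
+With this change, such writes also receive stripe entries in the RAID
+stripe tree.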
+ +Reviewed-by: Naohiro Aota +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +--- + fs/btrfs/inode.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3087,6 +3087,11 @@ int btrfs_finish_one_ordered(struct btrf + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); ++ ++ ret = btrfs_insert_raid_extent(trans, ordered_extent); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ + goto out; + } + diff --git a/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch b/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch new file mode 100644 index 0000000..496ce00 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch @@ -0,0 +1,107 @@ +From f6f5cd12972307324de5decd7fa41b0b3c98639c Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Fri, 18 Oct 2024 15:44:34 -0700 +Subject: btrfs: fix read corruption due to race with extent map merging + +In debugging some corrupt squashfs files, we observed symptoms of +corrupt page cache pages but correct on-disk contents. Further +investigation revealed that the exact symptom was a correct page +followed by an incorrect, duplicate, page. This got us thinking about +extent maps. + +commit ac05ca913e9f ("Btrfs: fix race between using extent maps and merging them") +enforces a reference count on the primary `em` extent_map being merged, +as that one gets modified. + +However, since, +commit 3d2ac9922465 ("btrfs: introduce new members for extent_map") +both 'em' and 'merge' get modified, which started modifying 'merge' +and thus introduced the same race. + +We were able to reproduce this by looping the affected squashfs workload +in parallel on a bunch of separate btrfs-es while also dropping caches. +We are still working on a simple enough reproducer to make into an fstest. + +The simplest fix is to stop modifying 'merge', which is not essential, +as it is dropped immediately after the merge. This behavior is simply +a consequence of the order of the two extent maps being important in +computing the new values. Modify merge_ondisk_extents to take prev and +next by const* and also take a third merged parameter that it puts the +results in. Note that this introduces the rather odd behavior of passing +'em' to merge_ondisk_extents as a const * and as a regular ptr. + +Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map") +CC: stable@vger.kernel.org # 6.11+ +Reviewed-by: Qu Wenruo +Reviewed-by: Filipe Manana +Signed-off-by: Omar Sandoval +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +--- + fs/btrfs/extent_map.c | 31 ++++++++++++++++--------------- + 1 file changed, 16 insertions(+), 15 deletions(-) + +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -240,13 +240,19 @@ static bool mergeable_maps(const struct + /* + * Handle the on-disk data extents merge for @prev and @next. + * ++ * @prev: left extent to merge ++ * @next: right extent to merge ++ * @merged: the extent we will not discard after the merge; updated with new values ++ * ++ * After this, one of the two extents is the new merged extent and the other is ++ * removed from the tree and likely freed. Note that @merged is one of @prev/@next ++ * so there is const/non-const aliasing occurring here. ++ * + * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. 
+ * For now only uncompressed regular extent can be merged. +- * +- * @prev and @next will be both updated to point to the new merged range. +- * Thus one of them should be removed by the caller. + */ +-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next) ++static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, ++ struct extent_map *merged) + { + u64 new_disk_bytenr; + u64 new_disk_num_bytes; +@@ -281,15 +287,10 @@ static void merge_ondisk_extents(struct + new_disk_bytenr; + new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; + +- prev->disk_bytenr = new_disk_bytenr; +- prev->disk_num_bytes = new_disk_num_bytes; +- prev->ram_bytes = new_disk_num_bytes; +- prev->offset = new_offset; +- +- next->disk_bytenr = new_disk_bytenr; +- next->disk_num_bytes = new_disk_num_bytes; +- next->ram_bytes = new_disk_num_bytes; +- next->offset = new_offset; ++ merged->disk_bytenr = new_disk_bytenr; ++ merged->disk_num_bytes = new_disk_num_bytes; ++ merged->ram_bytes = new_disk_num_bytes; ++ merged->offset = new_offset; + } + + static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, +@@ -358,7 +359,7 @@ static void try_merge_map(struct btrfs_i + em->generation = max(em->generation, merge->generation); + + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) +- merge_ondisk_extents(merge, em); ++ merge_ondisk_extents(merge, em, em); + em->flags |= EXTENT_FLAG_MERGED; + + validate_extent_map(fs_info, em); +@@ -375,7 +376,7 @@ static void try_merge_map(struct btrfs_i + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { + em->len += merge->len; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) +- merge_ondisk_extents(em, merge); ++ merge_ondisk_extents(em, merge, em); + validate_extent_map(fs_info, em); + rb_erase(&merge->rb_node, &tree->root); + RB_CLEAR_NODE(&merge->rb_node); diff --git a/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch b/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch new file mode 100644 index 0000000..4851e23 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch @@ -0,0 +1,101 @@ +From 7f83049bda761f340991af8dce79a4e98c62b378 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Thu, 19 Sep 2024 20:18:11 +0930 +Subject: btrfs: reject ro->rw reconfiguration if there are hard ro + requirements + +[BUG] +Syzbot reports the following crash: + + BTRFS info (device loop0 state MCS): disabling free space tree + BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE (0x1) + BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE_VALID (0x2) + Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN NOPTI + KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 + RIP: 0010:backup_super_roots fs/btrfs/disk-io.c:1691 [inline] + RIP: 0010:write_all_supers+0x97a/0x40f0 fs/btrfs/disk-io.c:4041 + Call Trace: + + btrfs_commit_transaction+0x1eae/0x3740 fs/btrfs/transaction.c:2530 + btrfs_delete_free_space_tree+0x383/0x730 fs/btrfs/free-space-tree.c:1312 + btrfs_start_pre_rw_mount+0xf28/0x1300 fs/btrfs/disk-io.c:3012 + btrfs_remount_rw fs/btrfs/super.c:1309 [inline] + btrfs_reconfigure+0xae6/0x2d40 
fs/btrfs/super.c:1534 + btrfs_reconfigure_for_mount fs/btrfs/super.c:2020 [inline] + btrfs_get_tree_subvol fs/btrfs/super.c:2079 [inline] + btrfs_get_tree+0x918/0x1920 fs/btrfs/super.c:2115 + vfs_get_tree+0x90/0x2b0 fs/super.c:1800 + do_new_mount+0x2be/0xb40 fs/namespace.c:3472 + do_mount fs/namespace.c:3812 [inline] + __do_sys_mount fs/namespace.c:4020 [inline] + __se_sys_mount+0x2d6/0x3c0 fs/namespace.c:3997 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +[CAUSE] +To support mounting different subvolume with different RO/RW flags for +the new mount APIs, btrfs introduced two workaround to support this feature: + +- Skip mount option/feature checks if we are mounting a different + subvolume + +- Reconfigure the fs to RW if the initial mount is RO + +Combining these two, we can have the following sequence: + +- Mount the fs ro,rescue=all,clear_cache,space_cache=v1 + rescue=all will mark the fs as hard read-only, so no v2 cache clearing + will happen. + +- Mount a subvolume rw of the same fs. + We go into btrfs_get_tree_subvol(), but fc_mount() returns EBUSY + because our new fc is RW, different from the original fs. + + Now we enter btrfs_reconfigure_for_mount(), which switches the RO flag + first so that we can grab the existing fs_info. + Then we reconfigure the fs to RW. + +- During reconfiguration, option/features check is skipped + This means we will restart the v2 cache clearing, and convert back to + v1 cache. + This will trigger fs writes, and since the original fs has "rescue=all" + option, it skips the csum tree read. + + And eventually causing NULL pointer dereference in super block + writeback. + +[FIX] +For reconfiguration caused by different subvolume RO/RW flags, ensure we +always run btrfs_check_options() to ensure we have proper hard RO +requirements met. + +In fact the function btrfs_check_options() doesn't really do many +complex checks, but hard RO requirement and some feature dependency +checks, thus there is no special reason not to do the check for mount +reconfiguration. 
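+
+Condensed reproducer for the sequence above (device, mount points and the
+subvolume name are illustrative):
+
+    $ mount -o ro,rescue=all,clear_cache,space_cache=v1 /dev/loop0 /mnt/a
+    $ mount -o rw,subvol=/snap /dev/loop0 /mnt/b
+
+Before this change the second mount re-ran the v1 cache conversion on a
+filesystem mounted with rescue=all and crashed in super block writeback;
+with the options check applied to reconfiguration as well, the rw
+transition is rejected instead.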
+ +Reported-by: syzbot+56360f93efa90ff15870@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/0000000000008c5d090621cb2770@google.com/ +Fixes: f044b318675f ("btrfs: handle the ro->rw transition for mounting different subvolumes") +CC: stable@vger.kernel.org # 6.8+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +--- + fs/btrfs/super.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1519,8 +1519,7 @@ static int btrfs_reconfigure(struct fs_c + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + +- if (!mount_reconfigure && +- !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) ++ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); diff --git a/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch b/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch new file mode 100644 index 0000000..06b8612 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch @@ -0,0 +1,54 @@ +From ed73b9279db9536a9672cba6506950c26cedb140 Mon Sep 17 00:00:00 2001 +From: Yue Haibing +Date: Tue, 22 Oct 2024 17:52:08 +0800 +Subject: btrfs: fix passing 0 to ERR_PTR in btrfs_search_dir_index_item() + +The ret may be zero in btrfs_search_dir_index_item() and should not +passed to ERR_PTR(). Now btrfs_unlink_subvol() is the only caller to +this, reconstructed it to check ERR_PTR(-ENOENT) while ret >= 0. + +This fixes smatch warnings: + +fs/btrfs/dir-item.c:353 + btrfs_search_dir_index_item() warn: passing zero to 'ERR_PTR' + +Fixes: 9dcbe16fccbb ("btrfs: use btrfs_for_each_slot in btrfs_search_dir_index_item") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Yue Haibing +Reviewed-by: David Sterba +Signed-off-by: David Sterba +--- + fs/btrfs/dir-item.c | 4 ++-- + fs/btrfs/inode.c | 7 ++----- + 2 files changed, 4 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -347,8 +347,8 @@ btrfs_search_dir_index_item(struct btrfs + return di; + } + /* Adjust return code if the key was not found in the next leaf. */ +- if (ret > 0) +- ret = 0; ++ if (ret >= 0) ++ ret = -ENOENT; + + return ERR_PTR(ret); + } +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4344,11 +4344,8 @@ static int btrfs_unlink_subvol(struct bt + */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); +- if (IS_ERR_OR_NULL(di)) { +- if (!di) +- ret = -ENOENT; +- else +- ret = PTR_ERR(di); ++ if (IS_ERR(di)) { ++ ret = PTR_ERR(di); + btrfs_abort_transaction(trans, ret); + goto out; + } diff --git a/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch b/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch new file mode 100644 index 0000000..18a27a7 --- /dev/null +++ b/debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch @@ -0,0 +1,407 @@ +From 88362669534c70bbc7036f45bb23e63a30d4adfb Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Mon, 29 Jul 2024 00:38:24 +0200 +Subject: mm: expose per-process KSM control via syscalls + +d7597f59d1d3 added a new API to enable per-process KSM control. 
It +however uses prctl, which doesn't allow controlling KSM from outside of +the current process. + +Hence, expose this API via 3 syscalls: process_ksm_enable, +process_ksm_disable and process_ksm_status. Given sufficient privileges, +auto-KSM can be enable by another process. + +Since these syscalls are not in the upstream kernel, also expose their +numbers under /sys/kernel/process_ksm so that userspace tooling like +uksmd knows how to use them. + +Signed-off-by: Oleksandr Natalenko +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 147 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 215 insertions(+), 1 deletion(-) + +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status 
sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common 
lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned lon + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2789,6 +2789,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsi + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct pid *pid; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ pid = pidfd_get_pid(pidfd, &f_flags); ++ if (IS_ERR(pid)) { ++ ret = PTR_ERR(pid); ++ goto out; ++ } ++ ++ task = get_pid_task(pid, PIDTYPE_PID); ++ if (!task) { ++ ret = -ESRCH; ++ goto put_pid; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++put_pid: ++ put_pid(pid); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 
common process_ksm_status sys_process_ksm_status +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch b/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch new file mode 100644 index 0000000..1213043 --- /dev/null +++ b/debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch @@ -0,0 +1,50 @@ +From 9308d03bfeb941469da17e2903ca06254b110b25 Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Tue, 24 Sep 2024 11:58:41 +0200 +Subject: mm/process_ksm: use pidfd_get_task() instead of + pidfd_get_pid()+get_pid_task() + +Link: https://git.kernel.org/linus/ee9955d61a0a +Signed-off-by: Oleksandr Natalenko +--- + kernel/sys.c | 15 +++------------ + 1 file changed, 3 insertions(+), 12 deletions(-) + +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2799,23 +2799,16 @@ enum pkc_action { + static long do_process_ksm_control(int pidfd, enum pkc_action action) + { + long ret; +- struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + unsigned int f_flags; + +- pid = pidfd_get_pid(pidfd, &f_flags); +- if (IS_ERR(pid)) { +- ret = PTR_ERR(pid); ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); + goto out; + } + +- task = get_pid_task(pid, PIDTYPE_PID); +- if (!task) { +- ret = -ESRCH; +- goto put_pid; +- } +- + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. 
*/ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { +@@ -2852,8 +2845,6 @@ release_mm: + mmput(mm); + release_task: + put_task_struct(task); +-put_pid: +- put_pid(pid); + out: + return ret; + } diff --git a/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch new file mode 100644 index 0000000..7b94907 --- /dev/null +++ b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch @@ -0,0 +1,18533 @@ +From 9b2b1b2e996c06793bc975dd66dccaadb0c0d637 Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Mon, 29 Jul 2024 00:42:23 +0200 +Subject: zstd: import upstream v1.5.6 + +Signed-off-by: Oleksandr Natalenko +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 +- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 356 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6576 insertions(+), 3530 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, 
Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionStri + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getF + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompres + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLev + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZST + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZST + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. 
++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZST + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(Z + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. 
+@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_ + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. 
++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_C + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. 
++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). 
++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZS + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(Z + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDiction + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. 
+- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1303,7 +1395,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZS + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. 
++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! 
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated +- */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). ++ */ ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateS + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBloc + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFra + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCPar + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! 
ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). 
+- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. 
Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. 
++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompres + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CSt + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_C + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CS + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. 
+- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_ + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advan + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advan + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNo + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. 
++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. 
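As a concrete illustration of the validity rules listed above (not part of the patch series), here is a minimal, hypothetical producer in C. It emits a single literals-only sequence covering the whole block, which satisfies every condition: the lengths sum to srcSize, the final sequence is allowed matchLength == 0, and in that case its offset is also 0. It uses the ZSTD_sequenceProducer_F signature and ZSTD_SEQUENCE_PRODUCER_ERROR macro introduced just below in this hunk.

/* Illustrative sketch only -- not part of the patch series.
 * A "null" block-level sequence producer: it finds no matches and
 * describes the whole block as literals, which is always a valid parse. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

static size_t nullSeqProducer(void *state,
                              ZSTD_Sequence *outSeqs, size_t outSeqsCapacity,
                              const void *src, size_t srcSize,
                              const void *dict, size_t dictSize,
                              int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;

    if (srcSize == 0) return 0;                 /* empty block: nothing to emit */
    if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;

    outSeqs[0].offset      = 0;                 /* final sequence with matchLength == 0 */
    outSeqs[0].litLength   = (unsigned)srcSize; /* lit + match lengths sum to srcSize */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep         = 0;
    return 1;                                   /* number of sequences written */
}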
++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! 
ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. ++* ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. 
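A hedged usage sketch for the registration call documented above (C, not part of the patch series): it wires the hypothetical nullSeqProducer from the previous note into a CCtx, enables fallback, and compresses through ZSTD_compress2(), one of the "advanced" entry points that actually honour the registered producer.

/* Illustrative sketch only -- not part of the patch series.
 * Assumes the nullSeqProducer() sketch shown in the earlier note. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t compress_with_external_producer(void *dst, size_t dstCap,
                                       const void *src, size_t srcSize)
{
    ZSTD_CCtx *cctx = ZSTD_createCCtx();
    size_t cSize;
    if (cctx == NULL) return (size_t)-1;

    /* Sticky until the next parameter reset; register NULL to clear it. */
    ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, nullSeqProducer);

    /* Optional: fall back to the internal parser if the producer errors out. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);

    /* Only compress2()/compressStream2() see advanced parameters, hence the producer. */
    cSize = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);

    ZSTD_freeCCtx(cctx);
    return cSize;   /* check with ZSTD_isError() */
}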
+- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. 
++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_adv + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSi + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,149 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(con + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
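A tiny worked example (userspace-style C, illustrative only, not part of the patch series) of what the bits.h helpers added above compute; the values follow directly from the definitions (ctz, 31 - clz, and rotation masked to the word width).

/* Illustrative sketch only -- not part of the patch series.
 * Demonstrates the semantics of the helpers in lib/zstd/common/bits.h. */
#include <assert.h>
#include "bits.h"   /* in-tree header; all helpers are MEM_STATIC */

static void bits_helpers_demo(void)
{
    assert(ZSTD_countTrailingZeros32(8U) == 3);             /* 0b1000 */
    assert(ZSTD_countLeadingZeros32(1U) == 31);
    assert(ZSTD_highbit32(5U) == 2);                        /* 31 - clz(0b101) */
    assert(ZSTD_countTrailingZeros64(1ULL << 40) == 40);
    assert(ZSTD_rotateRight_U32(0x1U, 1) == 0x80000000U);   /* wraps around */
    assert(ZSTD_rotateRight_U16((U16)0x0001, 1) == (U16)0x8000);
}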
+@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CS + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CS + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_ + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_sk + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_D + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reload + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. 
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
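A standalone sketch of the same guarded-attribute idea (macro and function names here are invented; the real definitions are the ones in this hunk). The point is that only the small out-of-line helper carries the no_sanitize annotation, so UBSAN stays fully enabled everywhere else:

#include <stddef.h>
#include <stdio.h>

#ifndef __has_attribute
# define __has_attribute(x) 0           /* fallback for compilers without it */
#endif

#if __has_attribute(no_sanitize)
# define DEMO_ALLOW_PTR_OVERFLOW __attribute__((no_sanitize("pointer-overflow")))
#else
# define DEMO_ALLOW_PTR_OVERFLOW
#endif

/* Wrapped pointer difference, kept out-of-line so only this helper needs the
 * attribute -- the same shape as ZSTD_wrappedPtrDiff() above. */
static DEMO_ALLOW_PTR_OVERFLOW
ptrdiff_t demo_wrapped_diff(const unsigned char *lhs, const unsigned char *rhs)
{
    return lhs - rhs;
}

int main(void)
{
    unsigned char buf[8];
    printf("%td\n", demo_wrapped_diff(buf + 8, buf));   /* prints 8 */
    return 0;
}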
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) 
\ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normal + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normal + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normal + * know that threshold > 1. 
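All the brace-only helper macros touched in these hunks (PREFETCH_AREA, ZSTD_UNREACHABLE, RAWLOG, DEBUGLOG, and further down the error-handling macros in error_private.h) move to do { ... } while (0) for the usual reason: a bare { ... } plus the caller's ';' breaks if/else nesting. A minimal standalone illustration with invented names:

#include <stdio.h>

#define LOG2_BAD(a, b)  { printf("%d\n", (a)); printf("%d\n", (b)); }
#define LOG2_OK(a, b)   do { printf("%d\n", (a)); printf("%d\n", (b)); } while (0)

int main(void)
{
    int verbose = 0;

    /* With LOG2_BAD this would not compile:
     *     if (verbose) LOG2_BAD(1, 2); else printf("quiet\n");
     * the ';' after the closing '}' ends the if statement, leaving the else
     * without a matching if.  The do/while form is a single statement that
     * still demands the trailing ';'. */
    if (verbose)
        LOG2_OK(1, 2);
    else
        printf("quiet\n");
    return 0;
}

The same reasoning is behind dropping the trailing semicolon from the existing while(0); forms of RETURN_ERROR and FORWARD_IF_ERROR further down: the caller's own ';' has to be the one that terminates the statement.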
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, s + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, siz + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeig + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
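The implied-last-weight logic above is easy to check by hand: each nonzero weight w contributes 2^(w-1) to weightTotal, the grand total must land on a clean power of two (2^tableLog), and the final symbol's weight is reconstructed from the remainder. A standalone worked example (invented helper, same arithmetic as the ZSTD_highbit32() calls in HUF_readStats_body):

#include <stdio.h>

static unsigned highbit32(unsigned v)       /* position of the highest set bit */
{
    unsigned n = 0;
    while (v >>= 1) n++;
    return n;
}

int main(void)
{
    unsigned const weightTotal = 48;                          /* sum of 2^(w-1) over decoded weights */
    unsigned const tableLog    = highbit32(weightTotal) + 1;  /* -> 6  */
    unsigned const total       = 1u << tableLog;              /* -> 64 */
    unsigned const rest        = total - weightTotal;         /* -> 16 */
    unsigned const verif       = 1u << highbit32(rest);       /* must equal rest */
    unsigned const lastWeight  = highbit32(rest) + 1;         /* -> 5  */

    if (verif != rest) {                    /* same corruption check as the patch */
        printf("corrupted: remainder %u is not a power of two\n", rest);
        return 1;
    }
    printf("tableLog=%u, implied last weight=%u\n", tableLog, lastWeight);
    return 0;
}

The implied symbol of weight 5 contributes 2^4 = 16, topping weightTotal = 48 up to the required power of two 64.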
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t c + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -89,20 +62,6 @@ FSE_PUBLIC_API const char* FSE_getErrorN + + + /*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- +-/*-***************************************** + * FSE detailed API + ******************************************/ + /*! 
+@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (v + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CSt + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. 
+- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(F + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(F + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(F + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(F + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, 
dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* ds + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
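The dtable relocation above -- computing dtablePos instead of keeping a flexible array member inside FSE_DecompressWksp -- is a general workspace-carving trick; a standalone sketch with invented names:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t table_entry;               /* stand-in for FSE_DTable         */

struct wksp_header {                        /* stand-in for FSE_DecompressWksp */
    int16_t ncount[16];
};

int main(void)
{
    uint64_t workspace[64];                 /* flat, suitably aligned scratch buffer */
    unsigned char *const base = (unsigned char *)workspace;
    struct wksp_header *const hdr = (struct wksp_header *)base;

    /* Offset of the variable-sized table, counted in table entries -- the same
     * arithmetic as dtablePos in the patched FSE_decompress_wksp_body(). */
    size_t const table_pos = sizeof(struct wksp_header) / sizeof(table_entry);
    table_entry *const table = (table_entry *)base + table_pos;

    /* The carving is only valid if the header occupies a whole number of table
     * entries; the patch pins that down with FSE_STATIC_ASSERT. */
    assert(sizeof(struct wksp_header) % sizeof(table_entry) == 0);

    hdr->ncount[0] = 1;
    table[0] = 42;
    printf("table starts %zu bytes into the workspace\n",
           (size_t)((unsigned char *)table - base));
    return 0;
}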
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
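The HUF_flags_e bitset above replaces the separate bmi2 / preferRepeat / suspectUncompressible arguments that the rest of the patch removes from the HUF entry points. A standalone sketch of the calling convention -- the demo enum only mirrors the values above so the snippet compiles on its own; real callers take them from huf.h:

#include <stdio.h>

typedef enum {
    demo_flags_bmi2                  = (1 << 0),
    demo_flags_optimalDepth          = (1 << 1),
    demo_flags_preferRepeat          = (1 << 2),
    demo_flags_suspectUncompressible = (1 << 3),
    demo_flags_disableAsm            = (1 << 4),
    demo_flags_disableFast           = (1 << 5)
} demo_flags_e;

static const char *pick_decoder_loop(int flags)
{
    /* a callee tests one bit, just like `flags & HUF_flags_bmi2` in the
     * patched HUF_readStats_wksp() */
    return (flags & demo_flags_disableFast) ? "fallback loop" : "fast loop";
}

int main(void)
{
    int flags = 0;
    flags |= demo_flags_bmi2;           /* e.g. the CPU reports BMI2 support */
    flags |= demo_flags_disableAsm;     /* e.g. a build without assembly     */

    printf("bmi2 requested: %s\n", (flags & demo_flags_bmi2) ? "yes" : "no");
    printf("decoder: %s\n", pick_decoder_loop(flags));
    return 0;
}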
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeig + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, s + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* 
dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. 
+ * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t divi + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. 
++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compre + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_c + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned max + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? 
use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, s + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, s + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, s + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, si + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t s + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normal + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, s + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. 
Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. 
+ */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huf + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huf + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huf + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). 
+ */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huf + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNo + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNo + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_ + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(H + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStre + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_defa + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ 
HUF_compress4X_usingCTable_internal(void + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fou + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,61 +1428,35 @@ 
HUF_compress_internal (void* dst, size_t + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CC + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEn + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxPar + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_pa + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_interna + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZST + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. ++ * @param params Validated zstd parameters. 
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZS + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case 
ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return 
CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxP + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ 
return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_rev + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CC + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CC + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ 
cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressio + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compres + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compres + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParam + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCPara + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compre + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compre + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 
3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usin + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usin + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usin + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1637,6 +1879,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); ++ 
ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZS + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZS + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? 
*/ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZS + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. 
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCD + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZST + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_match + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* s + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(se + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = 
seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(se + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(se + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(se + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCom + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ 
ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCom + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPt + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. 
*/ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; +- +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); ++ ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= 
ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, s + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t cons + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepco + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, s + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStat + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStat + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStat + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_ + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStat + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqSt + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_lit + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_sym + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, 
wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchByt + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct 
position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seq + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seq + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) + { +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCode + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. 
+ * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSpl + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). 
+ */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZST + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(Z + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(Z + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(Z + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetC + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetC + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded + * All blocks will be terminated, all input will be consumed. 
+ * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(Z + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(Z + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_inte + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t 
ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZS + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZS + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_comp + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_comp + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_comp + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_interna + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_ + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, 
cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, si + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_adv + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_i + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zc + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generi + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generi + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generi + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generi + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generi + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generi + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generi + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability( + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. 
*/ +@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStr + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStr + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStr + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ 
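/*
 * Illustrative sketch only, not part of the patch above: a minimal caller of
 * the streaming entry point whose initialization path is modified in this
 * hunk. It is written against the upstream userspace zstd.h API
 * (ZSTD_createCCtx, ZSTD_CCtx_setParameter, ZSTD_compressStream2); the
 * in-kernel build exposes zstd_* wrappers instead, so every name below is an
 * assumption for illustration, not part of this series.
 */
#include <stddef.h>
#include <zstd.h>

static size_t sketch_stream_compress(const void* src, size_t srcSize,
                                     void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_inBuffer  in  = { src, srcSize, 0 };
    ZSTD_outBuffer out = { dst, dstCapacity, 0 };
    size_t ret = 0;

    if (cctx == NULL) return 0;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);

    /* ZSTD_e_end goes through the transparent initialization stage shown
     * above, pledging (in.size - in.pos) as the source size. A non-zero,
     * non-error return means more output space is still needed, so
     * dstCapacity should be at least ZSTD_compressBound(srcSize) here. */
    do {
        ret = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
    } while (ret != 0 && !ZSTD_isError(ret) && out.pos < out.size);

    ZSTD_freeCCtx(cctx);
    if (ZSTD_isError(ret) || ret != 0) return 0;  /* error, or dst too small */
    return out.pos;
}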
++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, 
cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSe + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCt + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCt + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCt + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCt + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
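/*
 * Illustrative sketch only, not part of the patch: round-tripping the external
 * sequence interface consumed by the block loop above. It relies on the
 * upstream experimental userspace API (ZSTD_STATIC_LINKING_ONLY section of
 * zstd.h); ZSTD_generateSequences and ZSTD_sequenceBound are assumptions taken
 * from that header and are not defined anywhere in this patch.
 */
#define ZSTD_STATIC_LINKING_ONLY
#include <stdlib.h>
#include <zstd.h>

static size_t sketch_compress_with_sequences(const void* src, size_t srcSize,
                                             void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_Sequence* const seqs =
        malloc(ZSTD_sequenceBound(srcSize) * sizeof(ZSTD_Sequence));
    size_t cSize = 0;

    if (cctx && seqs) {
        /* 1) let zstd emit a sequence stream with explicit block delimiters */
        size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs,
                                                     ZSTD_sequenceBound(srcSize),
                                                     src, srcSize);
        if (!ZSTD_isError(nbSeqs)) {
            /* 2) feed the sequences back through the path implemented above */
            ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                   ZSTD_sf_explicitBlockDelimiters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);
            {   size_t const r = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                        seqs, nbSeqs, src, srcSize);
                if (!ZSTD_isError(r)) cSize = r;
            }
        }
    }
    free(seqs);
    ZSTD_freeCCtx(cctx);
    return cSize;  /* 0 signals failure in this sketch */
}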
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCt + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCt + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCt + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_g + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compr + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
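/*
 * Illustrative sketch only, not part of the patch: wiring up the external
 * sequence producer hook added by ZSTD_registerSequenceProducer() above. The
 * callback type ZSTD_sequenceProducer_F comes from the experimental section of
 * the upstream zstd.h; its parameter list is not reproduced here, so the
 * producer itself is a caller-supplied assumption.
 */
#define ZSTD_STATIC_LINKING_ONLY
#include <stddef.h>
#include <zstd.h>

static void sketch_use_sequence_producer(ZSTD_CCtx* cctx,
                                         void* producerState,
                                         ZSTD_sequenceProducer_F producer)
{
    /* install the producer; zstd consults it while compressing blocks */
    ZSTD_registerSequenceProducer(cctx, producerState, producer);

    /* optionally keep the internal matchfinder as a fallback if the producer
     * fails (field enableMatchFinderFallback below; the public parameter name
     * is assumed from upstream zstd.h) */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);

    /* ... run ZSTD_compressStream2() / ZSTD_compress2() as usual ... */

    /* passing NULL/NULL clears the registration, per the NULL branch above */
    ZSTD_registerSequenceProducer(cctx, NULL, NULL);
}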
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds( + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock ( + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t sr + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, con + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return 
ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 h + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowC + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_wi + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U3 + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. 
++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_stra + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* ds + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (vo + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCT + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. */ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCT + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(siz + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. 
++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_lite + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_lite + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_lite + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_lite + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequ + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; 
++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequ + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequ + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(cons + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(cons + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_ + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize( + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTable + 
return 0; + } + ++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
*/ + static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, +@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_mult + { + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_mult + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; + +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); + +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); +- } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, 
++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ ++ } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... 
] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_intern + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_all + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,11 +244,19 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_al + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + + /* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ ++/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,6 +358,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buff + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ + MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + { +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_alig + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. 
+ */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables( + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cw + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwk + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwk + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. 
See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_match + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noD + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noD + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noD + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noD + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ _match_found: /* requires ip, offset, mL + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ _match_stored: + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -254,6 +313,7 @@ _match_stored: + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dic + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dic + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = 
ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dic + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dic + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ _search_next_long: + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ _match_found: + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ _match_stored: + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ _match_stored: + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dic + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleF + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleF + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleF + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleF + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_ext + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_ext + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! 
*/ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_ + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ _start: /* Requires: ip0 */ + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). 
*/ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ _start: /* Requires: ip0 */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ _start: /* Requires: ip0 */ + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ _cleanup: + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ _offset: /* Requires: ip0, idx */ + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ _match: /* Requires: ip0, match0, offcod + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. 
*/ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ _match: /* Requires: ip0, match0, offcod + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatch + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatch + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatch + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. 
*/ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = 
repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatch + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_ex + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_ex + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_ex + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = 
ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies + * Starting from the 
LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_pref + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fill + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextC + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? 
ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_upda + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_search + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ _storeSequence: + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ _storeSequence: + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_g + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_g + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, 
searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ _storeSequence: + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ _storeSequence: + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZS + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZS + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -12,11 +13,14 @@
+ #include "hist.h"
+ #include "zstd_opt.h"
+ 
++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+ 
+ #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+ #define ZSTD_MAX_PRICE (1<<30)
+ 
+-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+ 
+ 
+ /*-*************************************
+@@ -26,27 +30,35 @@
+ #if 0 /* approximation at bit level (for tests) */
+ # define BITCOST_ACCURACY 0
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
+ #elif 0 /* fractional bit accuracy (for tests) */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
+ #else /* opt==approx, ultra==accurate */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+ #endif
+ 
++/* ZSTD_bitWeight() :
++ * provide estimated "cost" of a stat in full bits only */
+ MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+ {
+ return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+ }
+ 
++/* ZSTD_fracWeight() :
++ * provide fractional-bit "cost" of a stat,
++ * using linear interpolation approximation */
+ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+ {
+ U32 const stat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(stat);
+ U32 const BWeight = hb * BITCOST_MULTIPLIER;
++ /* Fweight was meant for "Fractional weight"
++ * but it's effectively a value between 1 and 2
++ * using fixed point arithmetic */
+ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + BITCOST_ACCURACY < 31);
+@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawSt
+ /* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+-MEM_STATIC double ZSTD_fCost(U32 price)
++MEM_STATIC double ZSTD_fCost(int price)
+ {
+ return (double)price / (BITCOST_MULTIPLIER*8);
+ }
+@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[
+ return total;
+ }
+ 
+-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
++
++static U32
++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
+ {
+ U32 s, sum=0;
+- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
++ (unsigned)lastEltIndex+1, (unsigned)shift );
+ assert(shift < 30);
+ for (s=0; s<lastEltIndex+1; s++) {
+- table[s] = 1 + (table[s] >> shift);
+- sum += table[s];
++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* tab + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optP + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optP + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optP + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BY + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BY + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const 
litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = 
ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const v + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexH + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + } + + if (*nbMatches == 0 || ((candidateMatchLength > 
matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_o + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltI + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ _shortestPath: /* cur, last_pos, best_ + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ _shortestPath: /* cur, last_pos, best_ + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatc + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDic + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDic + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(cons + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = 
lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. 
+- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(B + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DT + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstrea + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +- +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) ++ ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_ + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_b + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_b + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_b + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_b + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_int + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void 
HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} + +-static HUF_ASM_X86_64_BMI2_ATTRS ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_b + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static 
size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_D + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DT + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DSt + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +- +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) ++ ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_b + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_b + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_b + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_b + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_int + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, 
DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_b + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, 
cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, s + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DT + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZS + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDic + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. 
+- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZS + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_ + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. 
++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,49 +570,52 @@ static size_t readSkippableFrameSize(voi + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); +- +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); ++ ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +625,7 @@ unsigned long long ZSTD_findDecompressed + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressed + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = 
ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFram + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrame + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrame + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound( + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound( + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = 
ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. */ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_ + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_ + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_ + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame( + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. 
+ * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* d + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const v + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const v + * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_D + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zd + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_ + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_d + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DS + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, 
ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsig + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* sr + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize 
>= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, cons + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BY + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffe + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseSta + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseSta + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(Z + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuf + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuf + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they will need to read literals from 
litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuf + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuf + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == 
ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuf + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- 
BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 
++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or 
second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), 
dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequence + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + 
ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ * compared to maximum possible of (1< 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. 
++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. 
++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); +- +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); ++ ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dct + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. 
*/ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[Max + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + diff --git a/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch new file mode 100644 index 0000000..3b837a1 --- /dev/null +++ b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch @@ -0,0 +1,58 @@ +From c09f361b41027ca073de5631c66dfe0e7275c3a4 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Mon, 22 Jan 2024 16:27:56 -0800 +Subject: lib: zstd: Refactor intentional wrap-around test + +In an effort to separate intentional arithmetic wrap-around from +unexpected wrap-around, we need to refactor places that depend on this +kind of math. One of the most common code patterns of this is: + + VAR + value < VAR + +Notably, this is considered "undefined behavior" for signed and pointer +types, which the kernel works around by using the -fno-strict-overflow +option in the build[1] (which used to just be -fwrapv). Regardless, we +want to get the kernel source to the position where we can meaningfully +instrument arithmetic wrap-around conditions and catch them when they +are unexpected, regardless of whether they are signed[2], unsigned[3], +or pointer[4] types. + +Switch to a more regular type for a 64-bit value and refactor the +open-coded wrap-around addition test to use subtraction from the type max +(since add_would_overflow() may not be defined in early boot code). This +paves the way to enabling the wrap-around sanitizers in the future. 
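+
+A minimal sketch of the same pattern in isolation (the accumulate() helper is
+hypothetical and not part of this patch):
+
+	#include <linux/types.h>
+	#include <linux/limits.h>
+	#include <linux/errno.h>
+
+	/* Reject the addition up front instead of detecting wrap-around after it. */
+	static int accumulate(u64 *total, u64 add)
+	{
+		/* old form: if (*total + add < *total) -- relies on unsigned wrap-around */
+		if (U64_MAX - *total < add)
+			return -ERANGE;	/* would overflow */
+		*total += add;
+		return 0;
+	}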
+ +Link: https://git.kernel.org/linus/68df3755e383e6fecf2354a67b08f92f18536594 [1] +Link: https://github.com/KSPP/linux/issues/26 [2] +Link: https://github.com/KSPP/linux/issues/27 [3] +Link: https://github.com/KSPP/linux/issues/344 [4] +Cc: Nick Terrell +Cc: Paul Jones +Cc: Sedat Dilek +Cc: Oleksandr Natalenko +Cc: Xin Gao +Signed-off-by: Kees Cook +--- + lib/zstd/decompress/zstd_decompress.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -618,7 +618,7 @@ size_t ZSTD_readSkippableFrame(void* dst + * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); +@@ -636,7 +636,7 @@ unsigned long long ZSTD_findDecompressed + { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); + if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- if (totalDstSize + fcs < totalDstSize) ++ if (U64_MAX - totalDstSize < fcs) + return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ + totalDstSize += fcs; + } diff --git a/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch new file mode 100644 index 0000000..aa448d6 --- /dev/null +++ b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch @@ -0,0 +1,209 @@ +From 0bbb6450b1ba362c1c2e7d8d752b39ec9844629b Mon Sep 17 00:00:00 2001 +From: Christian Brauner +Date: Wed, 16 Jan 2019 23:13:25 +0100 +Subject: [PATCH 1/4] binder: turn into module + +The Android binder driver needs to become a module for the sake of shipping +Anbox. To do this we need to export the following functions since binder is +currently still using them: + +- security_binder_set_context_mgr() +- security_binder_transaction() +- security_binder_transfer_binder() +- security_binder_transfer_file() +- can_nice() +- __close_fd_get_file() +- mmput_async() +- task_work_add() +- map_kernel_range_noflush() +- get_vm_area() +- zap_page_range_single() +- put_ipc_ns() +- get_ipc_ns_exported() +- show_init_ipc_ns() + +Signed-off-by: Christian Brauner +[ saf: fix additional reference to init_ipc_ns from 5.0-rc6 ] +Signed-off-by: Seth Forshee +[ arighi: fix EXPORT_SYMBOL vs EXPORT_SYMBOL_GPL change from 6.0-rc5 ] +[ arighi: zap_page_range() has been dropped, export zap_page_range_single() in 6.3 ] +Signed-off-by: Andrea Righi +Signed-off-by: Alexandre Frade +--- + drivers/android/Kconfig | 6 +++--- + drivers/android/binder.c | 17 ++++++++++++++--- + drivers/android/binder_alloc.h | 3 ++- + drivers/android/binder_internal.h | 5 +++-- + drivers/android/binderfs.c | 6 +++--- + fs/file.c | 1 + + include/linux/ipc_namespace.h | 3 +++ + ipc/namespace.c | 17 +++++++++++++++++ + kernel/sched/syscalls.c | 1 + + kernel/task_work.c | 1 + + mm/memory.c | 1 + + mm/vmalloc.c | 1 + + security/security.c | 4 ++++ + 14 files changed, 61 insertions(+), 15 deletions(-) + +--- a/drivers/android/Kconfig ++++ b/drivers/android/Kconfig +@@ -14,8 +14,8 @@ config ANDROID_BINDER_IPC + between said processes. 
+ + config ANDROID_BINDERFS +- bool "Android Binderfs filesystem" +- depends on ANDROID_BINDER_IPC ++ tristate "Android Binderfs filesystem" ++ depends on (ANDROID_BINDER_IPC=y) || (ANDROID_BINDER_IPC=m && m) + default n + help + Binderfs is a pseudo-filesystem for the Android Binder IPC driver +--- a/drivers/android/binder.c ++++ b/drivers/android/binder.c +@@ -6713,9 +6713,20 @@ err_alloc_device_names_failed: + return ret; + } + +-device_initcall(binder_init); ++module_init(binder_init); ++/* ++ * binder will have no exit function since binderfs instances can be mounted ++ * multiple times and also in user namespaces finding and destroying them all ++ * is not feasible without introducing insane locking. Just ignoring existing ++ * instances on module unload also wouldn't work since we would loose track of ++ * what major numer was dynamically allocated and also what minor numbers are ++ * already given out. So this would get us into all kinds of issues with device ++ * number reuse. So simply don't allow unloading unless we are forced to do so. ++ */ ++ ++MODULE_AUTHOR("Google, Inc."); ++MODULE_DESCRIPTION("Driver for Android binder device"); ++MODULE_LICENSE("GPL v2"); + + #define CREATE_TRACE_POINTS + #include "binder_trace.h" +- +-MODULE_LICENSE("GPL v2"); +--- a/drivers/android/binder_alloc.h ++++ b/drivers/android/binder_alloc.h +@@ -6,6 +6,7 @@ + #ifndef _LINUX_BINDER_ALLOC_H + #define _LINUX_BINDER_ALLOC_H + ++#include + #include + #include + #include +@@ -111,7 +112,7 @@ struct binder_alloc { + bool oneway_spam_detected; + }; + +-#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST ++#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_SELFTEST) + void binder_selftest_alloc(struct binder_alloc *alloc); + #else + static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} +--- a/drivers/android/binder_internal.h ++++ b/drivers/android/binder_internal.h +@@ -5,6 +5,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ extern const struct file_operations bind + + extern char *binder_devices_param; + +-#ifdef CONFIG_ANDROID_BINDERFS ++#if IS_ENABLED(CONFIG_ANDROID_BINDERFS) + extern bool is_binderfs_device(const struct inode *inode); + extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, + const struct file_operations *fops, +@@ -99,7 +100,7 @@ static inline struct dentry *binderfs_cr + static inline void binderfs_remove_file(struct dentry *dentry) {} + #endif + +-#ifdef CONFIG_ANDROID_BINDERFS ++#if IS_ENABLED(CONFIG_ANDROID_BINDERFS) + extern int __init init_binderfs(void); + #else + static inline int __init init_binderfs(void) +--- a/drivers/android/binderfs.c ++++ b/drivers/android/binderfs.c +@@ -120,7 +120,7 @@ static int binderfs_binder_device_create + struct super_block *sb = ref_inode->i_sb; + struct binderfs_info *info = sb->s_fs_info; + #if defined(CONFIG_IPC_NS) +- bool use_reserve = (info->ipc_ns == &init_ipc_ns); ++ bool use_reserve = (info->ipc_ns == show_init_ipc_ns()); + #else + bool use_reserve = true; + #endif +@@ -397,7 +397,7 @@ static int binderfs_binder_ctl_create(st + struct dentry *root = sb->s_root; + struct binderfs_info *info = sb->s_fs_info; + #if defined(CONFIG_IPC_NS) +- bool use_reserve = (info->ipc_ns == &init_ipc_ns); ++ bool use_reserve = (info->ipc_ns == show_init_ipc_ns()); + #else + bool use_reserve = true; + #endif +@@ -683,7 +683,7 @@ static int binderfs_fill_super(struct su + return -ENOMEM; + info = sb->s_fs_info; + +- info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); ++ info->ipc_ns 
= get_ipc_ns_exported(current->nsproxy->ipc_ns); + + info->root_gid = make_kgid(sb->s_user_ns, 0); + if (!gid_valid(info->root_gid)) +--- a/include/linux/ipc_namespace.h ++++ b/include/linux/ipc_namespace.h +@@ -128,6 +128,9 @@ extern int mq_init_ns(struct ipc_namespa + static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } + #endif + ++extern struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns); ++extern struct ipc_namespace *show_init_ipc_ns(void); ++ + #if defined(CONFIG_IPC_NS) + extern struct ipc_namespace *copy_ipcs(unsigned long flags, + struct user_namespace *user_ns, struct ipc_namespace *ns); +--- a/ipc/namespace.c ++++ b/ipc/namespace.c +@@ -207,6 +207,22 @@ void put_ipc_ns(struct ipc_namespace *ns + } + EXPORT_SYMBOL_GPL(put_ipc_ns); + ++struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns) ++{ ++ return get_ipc_ns(ns); ++} ++EXPORT_SYMBOL(get_ipc_ns_exported); ++ ++struct ipc_namespace *show_init_ipc_ns(void) ++{ ++#if defined(CONFIG_IPC_NS) ++ return &init_ipc_ns; ++#else ++ return NULL; ++#endif ++} ++EXPORT_SYMBOL(show_init_ipc_ns); ++ + static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) + { + return container_of(ns, struct ipc_namespace, ns); +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3166,6 +3166,7 @@ struct vm_struct *get_vm_area(unsigned l + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + } ++EXPORT_SYMBOL(get_vm_area); + + struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + const void *caller) diff --git a/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch new file mode 100644 index 0000000..e9f5aca --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch @@ -0,0 +1,82 @@ +From 1e2892b83e6c4062623f9aec06eba27884d47059 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 13 Dec 2018 01:00:49 +0000 +Subject: [PATCH 1/4] sched/wait: Do accept() in LIFO order for cache + efficiency + +Signed-off-by: Alexandre Frade +--- + include/linux/wait.h | 2 ++ + kernel/sched/wait.c | 24 ++++++++++++++++++++++++ + net/ipv4/inet_connection_sock.c | 2 +- + 3 files changed, 27 insertions(+), 1 deletion(-) + +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -163,6 +163,7 @@ static inline bool wq_has_sleeper(struct + + extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); ++extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + +@@ -1191,6 +1192,7 @@ do { \ + */ + void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + void finish_wait(struct 
wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +--- a/kernel/sched/wait.c ++++ b/kernel/sched/wait.c +@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait + } + EXPORT_SYMBOL_GPL(add_wait_queue_priority); + ++void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ __add_wait_queue(wq_head, wq_entry); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); ++ + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + unsigned long flags; +@@ -259,6 +270,19 @@ prepare_to_wait_exclusive(struct wait_qu + } + EXPORT_SYMBOL(prepare_to_wait_exclusive); + ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ if (list_empty(&wq_entry->entry)) ++ __add_wait_queue(wq_head, wq_entry); ++ set_current_state(state); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); ++ + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) + { + wq_entry->flags = flags; +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(str + * having to remove and re-insert us on the wait queue. + */ + for (;;) { +- prepare_to_wait_exclusive(sk_sleep(sk), &wait, ++ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) diff --git a/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch new file mode 100644 index 0000000..b9342a8 --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch @@ -0,0 +1,25 @@ +From 52caaf4af0242cc3ac1ddaf9791c4c08a4b7226d Mon Sep 17 00:00:00 2001 +From: William Douglas +Date: Wed, 20 Jun 2018 17:23:21 +0000 +Subject: [PATCH 2/4] firmware: Enable stateless firmware loading + +Prefer the order of specific version before generic and /etc before +/lib to enable the user to give specific overrides for generic +firmware and distribution firmware. 
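+
+As an illustration (the firmware name below is hypothetical), a request for
+"vendor/device.bin" is now tried in this order before the remaining built-in
+locations:
+
+	<firmware_class.path= override, if set>
+	/etc/firmware/<UTS_RELEASE>/vendor/device.bin
+	/etc/firmware/vendor/device.bin
+	/lib/firmware/updates/<UTS_RELEASE>/vendor/device.bin
+	/lib/firmware/updates/vendor/device.bin
+	/lib/firmware/<UTS_RELEASE>/vendor/device.bin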
+ +Signed-off-by: Alexandre Frade +--- + drivers/base/firmware_loader/main.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/base/firmware_loader/main.c ++++ b/drivers/base/firmware_loader/main.c +@@ -471,6 +471,8 @@ static int fw_decompress_xz(struct devic + static char fw_path_para[256]; + static const char * const fw_path[] = { + fw_path_para, ++ "/etc/firmware/" UTS_RELEASE, ++ "/etc/firmware", + "/lib/firmware/updates/" UTS_RELEASE, + "/lib/firmware/updates", + "/lib/firmware/" UTS_RELEASE, diff --git a/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch new file mode 100644 index 0000000..79cbb57 --- /dev/null +++ b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch @@ -0,0 +1,32 @@ +From f57e26428b29ddcdf58b35659002293bde1bf281 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH 3/4] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit + +Signed-off-by: Alexandre Frade +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -749,6 +749,7 @@ rwsem_spin_on_owner(struct rw_semaphore + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -785,7 +786,8 @@ rwsem_spin_on_owner(struct rw_semaphore + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; diff --git a/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch new file mode 100644 index 0000000..a6c2e65 --- /dev/null +++ b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch @@ -0,0 +1,2288 @@ +From f68a84ded01d688aa58a973788f507899fbe191f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 27 Feb 2023 01:38:18 +0000 +Subject: [PATCH 1/2] netfilter: Add netfilter nf_tables fullcone support + +Signed-off-by: Syrone Wong +Signed-off-by: Alexandre Frade +--- + include/net/netfilter/nf_nat_fullcone.h | 156 +++ + net/netfilter/Kconfig | 15 + + net/netfilter/Makefile | 5 + + net/netfilter/nf_nat_fullcone.c | 1604 +++++++++++++++++++++++ + net/netfilter/nft_ext_fullcone.c | 466 +++++++ + 5 files changed, 2246 insertions(+) + create mode 100644 include/net/netfilter/nf_nat_fullcone.h + create mode 100644 net/netfilter/nf_nat_fullcone.c + create mode 100644 net/netfilter/nft_ext_fullcone.c + +--- /dev/null ++++ b/include/net/netfilter/nf_nat_fullcone.h +@@ -0,0 +1,156 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support library header ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ */ ++#ifndef _NF_NAT_FULLCONE_H_ ++#define _NF_NAT_FULLCONE_H_ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include 
++#include ++ ++#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY ++#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ++static inline int nf_ct_netns_get(struct net *net, u8 nfproto) ++{ ++ return 0; ++} ++ ++static inline void nf_ct_netns_put(struct net *net, u8 nfproto) ++{ ++} ++#endif ++ ++/** ++ * enum nft_fullcone_attributes - nf_tables fullcone expression netlink attributes ++ * ++ * @NFTA_FULLCONE_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) ++ * @NFTA_FULLCONE_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) ++ * @NFTA_FULLCONE_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) ++ */ ++enum nft_fullcone_attributes { ++ NFTA_FULLCONE_UNSPEC, ++ NFTA_FULLCONE_REG_PROTO_MIN, ++ NFTA_FULLCONE_REG_PROTO_MAX, ++ NFTA_FULLCONE_FLAGS, ++ __NFTA_FULLCONE_MAX ++}; ++#define NFTA_FULLCONE_MAX (__NFTA_FULLCONE_MAX - 1) ++ ++/* fullcone specific data structures */ ++ ++struct nat_mapping_original_tuple { ++ struct nf_conntrack_tuple tuple; ++ ++ struct list_head node; ++}; ++ ++struct nat_mapping { ++ uint16_t port; /* external source port */ ++ __be32 addr; /* external source ip address */ ++ ++ __be32 int_addr; /* internal source ip address */ ++ uint16_t int_port; /* internal source port */ ++ ++ int refer_count; /* how many references linked to this mapping ++ * aka. length of original_tuple_list */ ++ ++ struct list_head original_tuple_list; ++ ++ struct hlist_node node_by_ext_port; ++ struct hlist_node node_by_int_src; ++ ++}; ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++struct nat_mapping6 { ++ uint16_t port; /* external source port */ ++ union nf_inet_addr addr; /* external source ip address */ ++ ++ union nf_inet_addr int_addr; /* internal source ip address */ ++ uint16_t int_port; /* internal source port */ ++ ++ int refer_count; /* how many references linked to this mapping ++ * aka. 
length of original_tuple_list */ ++ ++ struct list_head original_tuple_list; ++ ++ struct hlist_node node_by_ext_port; ++ struct hlist_node node_by_int_src; ++ ++}; ++#endif ++ ++struct tuple_list { ++ struct nf_conntrack_tuple tuple_original; ++ struct nf_conntrack_tuple tuple_reply; ++ struct list_head list; ++}; ++ ++/* fullcone specific data structures end */ ++ ++// NOTE: declaration listed here must use EXPORT_SYMBOL_* ++ ++unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out); ++ ++unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out); ++ ++void nf_nat_fullcone_handle_dying_tuples(void); ++void nf_nat_fullcone_destroy_mappings(void); ++void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying); ++ ++/* ++ * For [FUTURE] usage ++ * ++ * from https://elixir.bootlin.com/linux/v5.15.32/source/net/netfilter/xt_nat.c#L37 ++static void xt_nat_convert_range(struct nf_nat_range2 *dst, ++ const struct nf_nat_ipv4_range *src) ++{ ++ memset(&dst->min_addr, 0, sizeof(dst->min_addr)); ++ memset(&dst->max_addr, 0, sizeof(dst->max_addr)); ++ // base_proto is nf_nat_range2 specific ++ memset(&dst->base_proto, 0, sizeof(dst->base_proto)); ++ ++ dst->flags = src->flags; ++ dst->min_addr.ip = src->min_ip; ++ dst->max_addr.ip = src->max_ip; ++ dst->min_proto = src->min; ++ dst->max_proto = src->max; ++} ++ * ++ */ ++ ++#endif /*_NF_NAT_FULLCONE_H_ */ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -571,6 +571,21 @@ config NFT_NAT + This option adds the "nat" expression that you can use to perform + typical Network Address Translation (NAT) packet transformations. + ++config NFT_FULLCONE ++ depends on NF_CONNTRACK ++ depends on NF_NAT ++ tristate "Netfilter nf_tables fullcone support" ++ help ++ This options adds the "fullcone" expression that you can use ++ to perform NAT in the RFC3489-compatible full cone SNAT flavour. ++ Currently only UDP traffic is supported for full-cone NAT. ++ For other protos FULLCONENAT is equivalent to MASQUERADE. ++ ++ To compile this code as a module, choose M here: the module will be ++ called nft_fullcone. ++ ++ If unsure, say N. 
++ + config NFT_TUNNEL + tristate "Netfilter nf_tables tunnel module" + help +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -240,3 +240,8 @@ obj-$(CONFIG_IP_VS) += ipvs/ + + # lwtunnel + obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o ++ ++# Nftables/netfilter fullcone expression support ++obj-$(CONFIG_NFT_FULLCONE) += \ ++ nf_nat_fullcone.o \ ++ nft_ext_fullcone.o +--- /dev/null ++++ b/net/netfilter/nf_nat_fullcone.c +@@ -0,0 +1,1604 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support library ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ * Copyright (c) 2022 Alexandre Frade ++ * Ported to latest kernel source tree ++ */ ++ ++#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif ++ ++#include ++#include ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#include ++#include ++#include ++#endif ++ ++#include ++ ++/* ++ * FULLCONE_HKEY generates u32 hash value ++ * Modified from net/netfilter/ipset/ip_set_hash_gen.h ++ * dataptr: a pointer ++ * datatypelen: sizeof(struct blah) or sizeof(u32) ++ * initval: initial value ++ * htable_bits: hashtable bits ++ */ ++#define FULLCONE_HKEY(dataptr, datatypelen, initval, htable_bits) \ ++({ \ ++ const u32 *__k = (const u32 *)(dataptr); \ ++ u32 __l = (datatypelen) / sizeof(u32); \ ++ \ ++ BUILD_BUG_ON((datatypelen) % sizeof(u32) != 0); \ ++ \ ++ jhash2(__k, __l, (initval)) & jhash_mask((htable_bits)); \ ++}) ++ ++#define HASHTABLE_BUCKET_BITS 10 ++ ++/* static variables */ ++ ++static DEFINE_HASHTABLE(mapping_table_by_ext_port, HASHTABLE_BUCKET_BITS); ++static DEFINE_HASHTABLE(mapping_table_by_int_src, HASHTABLE_BUCKET_BITS); ++ ++static DEFINE_SPINLOCK(fullconenat_lock); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static DEFINE_HASHTABLE(mapping6_table_by_ext_port, HASHTABLE_BUCKET_BITS); ++static DEFINE_HASHTABLE(mapping6_table_by_int_src, HASHTABLE_BUCKET_BITS); ++ ++static DEFINE_SPINLOCK(fullconenat6_lock); ++#endif ++ ++static LIST_HEAD(dying_tuple_list); ++static DEFINE_SPINLOCK(dying_tuple_list_lock); ++ ++/* static variables end */ ++ ++/* forward declaration */ ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, ++ const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++/* non-atomic: can only be called serially within lock zones. */ ++static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple ++ *t); ++#endif ++/* non-atomic: can only be called serially within lock zones. 
*/ ++static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr ++ *int_addr, ++ const uint16_t int_port, ++ const uint16_t port, const union nf_inet_addr *addr); ++#endif ++static struct nat_mapping *allocate_mapping(const __be32 int_addr, ++ const uint16_t int_port, const uint16_t port, const __be32 addr); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple ++ *original_tuple); ++#endif ++static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple ++ *original_tuple); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr ++ *src_ip, const uint16_t src_port, const union nf_inet_addr ++ *ext_ip); ++#endif ++ ++static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union ++ nf_inet_addr ++ *src_ip, const uint16_t src_port, const union ++ nf_inet_addr ++ *min_ip, const union ++ nf_inet_addr ++ *max_ip); ++#endif ++static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, ++ const uint16_t ++ src_port, const __be32 min_ip, const __be32 max_ip); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void kill_mapping6(struct nat_mapping6 *mapping); ++#endif ++static void kill_mapping(struct nat_mapping *mapping); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping6() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone); ++#else ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone); ++#endif ++ ++#endif ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping() returns 0. 
*/ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone); ++#else ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone); ++#else ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const u16 zone); ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone); ++#else ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, ++ const __be32 ext_ip, struct net *net, const u16 zone); ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port6(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++ ++#else ++ struct nf_nat_range *range); ++ ++#endif ++ ++#else ++static uint16_t find_appropriate_port6(struct net *net, const u16 zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++ ++#else ++ struct nf_nat_range *range); ++ ++#endif ++#endif ++ ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++ ++#else ++static uint16_t find_appropriate_port(struct net *net, const u16 zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++#endif ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static void find_leastused_ip6(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); ++#else ++static void find_leastused_ip6(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, 
++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst); ++#else ++static __be32 find_leastused_ip(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst); ++#endif ++ ++/* forward declaration end */ ++ ++/* non-atomic part */ ++ ++static char tuple_tmp_string[512]; ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++/* non-atomic: can only be called serially within lock zones. */ ++static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple *t) ++{ ++ snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), ++ "[%pI6c]:%hu -> [%pI6c]:%hu", &t->src.u3.ip6, ++ be16_to_cpu(t->src.u.all), &t->dst.u3.ip6, be16_to_cpu(t->dst.u.all)); ++ return tuple_tmp_string; ++} ++#endif ++/* non-atomic: can only be called serially within lock zones. */ ++static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t) ++{ ++ snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), ++ "%pI4:%hu -> %pI4:%hu", &t->src.u3.ip, ++ be16_to_cpu(t->src.u.all), &t->dst.u3.ip, be16_to_cpu(t->dst.u.all)); ++ return tuple_tmp_string; ++} ++ ++/* non-atomic part end */ ++ ++void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying) ++{ ++ spin_lock_bh(&dying_tuple_list_lock); ++ list_add(new_dying, &dying_tuple_list); ++ spin_unlock_bh(&dying_tuple_list_lock); ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_dying_tuple_list_add); ++ ++void nf_nat_fullcone_handle_dying_tuples(void) ++{ ++ struct list_head *iter, *tmp, *iter_2, *tmp_2; ++ struct tuple_list *item; ++ struct nf_conntrack_tuple *ct_tuple; ++ struct nat_mapping *mapping; ++ __be32 ip, ext_ip; ++ uint16_t port; ++ struct nat_mapping_original_tuple *original_tuple_item; ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ struct nat_mapping6 *mapping6; ++ union nf_inet_addr *ip6, *ext_ip6; ++ spin_lock_bh(&fullconenat6_lock); ++#endif ++ ++ spin_lock_bh(&fullconenat_lock); ++ spin_lock_bh(&dying_tuple_list_lock); ++ ++ list_for_each_safe(iter, tmp, &dying_tuple_list) { ++ item = list_entry(iter, struct tuple_list, list); ++ ++ /* we dont know the conntrack direction for now so we try in both ways. 
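++ * first assume tuple_original is the outbound tuple; if no mapping matches, retry using tuple_reply.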
*/ ++ ct_tuple = &(item->tuple_original); ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ if (ct_tuple->src.l3num == PF_INET6) { ++ ip6 = &(ct_tuple->src).u3; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ ext_ip6 = &item->tuple_reply.dst.u3; ++ mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); ++ if (mapping6 == NULL) { ++ ext_ip6 = &(ct_tuple->dst).u3; ++ ct_tuple = &(item->tuple_reply); ++ ip6 = &(ct_tuple->src).u3; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); ++ if (mapping6 != NULL) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", ++ mapping6->port); ++ } ++ } else { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", ++ mapping6->port); ++ } ++ ++ if (mapping6 == NULL) { ++ goto next; ++ } ++ ++ /* look for the corresponding out-dated tuple and free it */ ++ list_for_each_safe(iter_2, tmp_2, &mapping6->original_tuple_list) { ++ original_tuple_item = list_entry(iter_2, struct ++ nat_mapping_original_tuple, node); ++ ++ if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. free this tuple.\n", ++ fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping6->refer_count)--; ++ } ++ } ++ ++ /* then kill the mapping if needed */ ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", ++ mapping6->port, mapping6->refer_count); ++ if (mapping6->refer_count <= 0) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", ++ mapping6->port); ++ kill_mapping6(mapping6); ++ } ++ goto next; ++ } ++ if (unlikely(ct_tuple->src.l3num != PF_INET)) ++#else ++ if (ct_tuple->src.l3num != PF_INET) ++#endif ++ goto next; ++ ++ ip = (ct_tuple->src).u3.ip; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ ext_ip = item->tuple_reply.dst.u3.ip; ++ mapping = get_mapping_by_int_src(ip, port, ext_ip); ++ if (mapping == NULL) { ++ ext_ip = (ct_tuple->dst).u3.ip; ++ ct_tuple = &(item->tuple_reply); ++ ip = (ct_tuple->src).u3.ip; ++ port = be16_to_cpu((ct_tuple->src).u.udp.port); ++ mapping = get_mapping_by_int_src(ip, port, ext_ip); ++ if (mapping != NULL) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", ++ mapping->port); ++ } ++ } else { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", ++ mapping->port); ++ } ++ ++ if (mapping == NULL) { ++ goto next; ++ } ++ ++ /* look for the corresponding out-dated tuple and free it */ ++ list_for_each_safe(iter_2, tmp_2, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter_2, struct nat_mapping_original_tuple, node); ++ ++ if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. 
free this tuple.\n", ++ nf_ct_stringify_tuple(&original_tuple_item->tuple)); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } ++ } ++ ++ /* then kill the mapping if needed */ ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug ++ ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", ++ mapping->port); ++ kill_mapping(mapping); ++ } ++ ++next: ++ list_del(&item->list); ++ kfree(item); ++ } ++ ++ spin_unlock_bh(&dying_tuple_list_lock); ++ spin_unlock_bh(&fullconenat_lock); ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ spin_unlock_bh(&fullconenat6_lock); ++#endif ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_handle_dying_tuples); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr ++ *int_addr, ++ const uint16_t int_port, ++ const uint16_t port, const union nf_inet_addr *addr) ++{ ++ struct nat_mapping6 *p_new; ++ u32 hash_src; ++ ++ p_new = kmalloc(sizeof(struct nat_mapping6), GFP_ATOMIC); ++ if (p_new == NULL) { ++ pr_err("kmalloc() for allocate_mapping6 failed.\n"); ++ return NULL; ++ } ++ p_new->addr = *addr; ++ p_new->port = port; ++ p_new->int_addr = *int_addr; ++ p_new->int_port = int_port; ++ p_new->refer_count = 0; ++ (p_new->original_tuple_list).next = &(p_new->original_tuple_list); ++ (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); ++ ++ hash_src = FULLCONE_HKEY(int_addr, sizeof(union nf_inet_addr), (u32) int_port, HASHTABLE_BUCKET_BITS); ++ //hash_src = jhash2((u32 *) int_addr->all, 4, (u32) int_port); ++ ++ hash_add(mapping6_table_by_ext_port, &p_new->node_by_ext_port, port); ++ hash_add(mapping6_table_by_int_src, &p_new->node_by_int_src, hash_src); ++ ++ pr_debug("new mapping allocated for [%pI6c]:%d ==> [%pI6c]:%d\n", ++ &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); ++ ++ return p_new; ++} ++#endif ++static struct nat_mapping *allocate_mapping(const __be32 int_addr, ++ const uint16_t int_port, const uint16_t port, const __be32 addr) ++{ ++ struct nat_mapping *p_new; ++ u32 hash_src; ++ ++ p_new = kmalloc(sizeof(struct nat_mapping), GFP_ATOMIC); ++ if (p_new == NULL) { ++ pr_err("kmalloc() for allocate_mapping failed.\n"); ++ return NULL; ++ } ++ p_new->addr = addr; ++ p_new->port = port; ++ p_new->int_addr = int_addr; ++ p_new->int_port = int_port; ++ p_new->refer_count = 0; ++ (p_new->original_tuple_list).next = &(p_new->original_tuple_list); ++ (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); ++ ++ hash_src = FULLCONE_HKEY(&int_addr, sizeof(__be32), (u32) int_port, HASHTABLE_BUCKET_BITS); ++ //hash_src = HASH_2(int_addr, (u32) int_port); ++ ++ hash_add(mapping_table_by_ext_port, &p_new->node_by_ext_port, port); ++ hash_add(mapping_table_by_int_src, &p_new->node_by_int_src, hash_src); ++ ++ pr_debug("new mapping allocated for %pI4:%d ==> %pI4:%d\n", ++ &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); ++ ++ return p_new; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple ++ *original_tuple) ++{ ++ struct 
nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); ++ if (item == NULL) { ++ pr_err("kmalloc() for add_original_tuple_to_mapping6 failed.\n"); ++ return; ++ } ++ memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); ++ list_add(&item->node, &mapping->original_tuple_list); ++ (mapping->refer_count)++; ++} ++#endif ++static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple ++ *original_tuple) ++{ ++ struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); ++ if (item == NULL) { ++ pr_err("kmalloc() for add_original_tuple_to_mapping failed.\n"); ++ return; ++ } ++ memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); ++ list_add(&item->node, &mapping->original_tuple_list); ++ (mapping->refer_count)++; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr ++ *src_ip, const uint16_t src_port, const union nf_inet_addr ++ *ext_ip) ++{ ++ struct nat_mapping6 *p_current; ++ u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); ++ ++ hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) ++ && p_current->int_port == src_port && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++ ++static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip) ++{ ++ struct nat_mapping *p_current; ++ u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = HASH_2(src_ip, (u32) src_port); ++ ++ hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (p_current->int_addr == src_ip && p_current->int_port == src_port && p_current->addr == ext_ip) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union ++ nf_inet_addr ++ *src_ip, const uint16_t src_port, const union ++ nf_inet_addr ++ *min_ip, const union ++ nf_inet_addr ++ *max_ip) ++{ ++ struct nat_mapping6 *p_current; ++ ++ u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); ++ ++ hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) ++ && p_current->int_port == src_port ++ && memcmp(&p_current->addr, min_ip, ++ sizeof(union nf_inet_addr)) >= 0 ++ && memcmp(&p_current->addr, max_ip, sizeof(union nf_inet_addr)) <= 0) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, ++ const uint16_t ++ src_port, const __be32 min_ip, const __be32 max_ip) ++{ ++ struct nat_mapping *p_current; ++ u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); ++ //u32 hash_src = HASH_2(src_ip, (u32) src_port); ++ ++ 
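++ /* scan this (src_ip, src_port) hash bucket and return the first mapping whose external address falls within [min_ip, max_ip] */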
hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { ++ if (p_current->int_addr == src_ip ++ && p_current->int_port == src_port ++ && memcmp(&p_current->addr, &min_ip, sizeof(__be32)) >= 0 ++ && memcmp(&p_current->addr, &max_ip, sizeof(__be32)) <= 0) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++static void kill_mapping6(struct nat_mapping6 *mapping) ++{ ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ ++ if (mapping == NULL) { ++ return; ++ } ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ } ++ ++ hash_del(&mapping->node_by_ext_port); ++ hash_del(&mapping->node_by_int_src); ++ kfree(mapping); ++} ++#endif ++static void kill_mapping(struct nat_mapping *mapping) ++{ ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ ++ if (mapping == NULL) { ++ return; ++ } ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ } ++ ++ hash_del(&mapping->node_by_ext_port); ++ hash_del(&mapping->node_by_int_src); ++ kfree(mapping); ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. ++ * the mapping should not be used anymore after check_mapping6() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone) ++{ ++#else ++static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone) ++{ ++#endif ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ struct nf_conntrack_tuple_hash *tuple_hash; ++ struct nf_conn *ct; ++ ++ /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. ++ * so we manually kill one of those tuples once we acquire one. */ ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ ++ tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); ++ ++ if (tuple_hash == NULL) { ++ pr_debug ++ ("check_mapping6(): tuple %s dying/unconfirmed. free this tuple.\n", ++ fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); ++ ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } else { ++ ct = nf_ct_tuplehash_to_ctrack(tuple_hash); ++ if (likely(ct != NULL)) ++ nf_ct_put(ct); ++ } ++ ++ } ++ ++ /* kill the mapping if need */ ++ pr_debug ++ ("check_mapping6() refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug("check_mapping6(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); ++ kill_mapping6(mapping); ++ return 0; ++ } else { ++ return 1; ++ } ++} ++ ++#endif ++ ++/* check if a mapping is valid. ++ * possibly delete and free an invalid mapping. 
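++ * each recorded original tuple is looked up in conntrack; tuples that are gone are freed and refer_count is decremented.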
++ * the mapping should not be used anymore after check_mapping() returns 0. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone) ++{ ++#else ++static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone) ++{ ++#endif ++ struct list_head *iter, *tmp; ++ struct nat_mapping_original_tuple *original_tuple_item; ++ struct nf_conntrack_tuple_hash *tuple_hash; ++ struct nf_conn *ct; ++ ++ /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. ++ * so we manually kill one of those tuples once we acquire one. */ ++ ++ list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { ++ original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); ++ ++ tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); ++ ++ if (tuple_hash == NULL) { ++ pr_debug ++ ("check_mapping(): tuple %s dying/unconfirmed. free this tuple.\n", ++ nf_ct_stringify_tuple(&original_tuple_item->tuple)); ++ ++ list_del(&original_tuple_item->node); ++ kfree(original_tuple_item); ++ (mapping->refer_count)--; ++ } else { ++ ct = nf_ct_tuplehash_to_ctrack(tuple_hash); ++ if (likely(ct != NULL)) ++ nf_ct_put(ct); ++ } ++ ++ } ++ ++ /* kill the mapping if need */ ++ pr_debug ++ ("check_mapping() refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); ++ if (mapping->refer_count <= 0) { ++ pr_debug("check_mapping(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); ++ kill_mapping(mapping); ++ return 0; ++ } else { ++ return 1; ++ } ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone) ++{ ++#else ++static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr ++ *ext_ip, struct net *net, const u16 zone) ++{ ++#endif ++ struct nat_mapping6 *p_current; ++ struct hlist_node *tmp; ++ ++ hash_for_each_possible_safe(mapping6_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { ++ if (p_current->port == port && check_mapping6(p_current, net, zone) ++ && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct ++ nf_conntrack_zone *zone) ++{ ++#else ++static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, ++ const __be32 ext_ip, struct net *net, const u16 zone) ++{ ++#endif ++ struct nat_mapping *p_current; ++ struct hlist_node *tmp; ++ ++ hash_for_each_possible_safe(mapping_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { ++ if (p_current->port == port && check_mapping(p_current, net, zone) ++ && p_current->addr == ext_ip) { ++ return p_current; ++ } ++ } ++ ++ return NULL; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port6(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const union nf_inet_addr 
*ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#else ++static uint16_t find_appropriate_port6(struct net *net, const u16 zone, ++ const uint16_t original_port, const union nf_inet_addr *ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#endif ++{ ++ uint16_t min, start, selected, range_size, i; ++ struct nat_mapping6 *mapping = NULL; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&range->base_proto, 0, sizeof(range->base_proto)); ++#endif ++ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { ++ min = be16_to_cpu((range->min_proto).udp.port); ++ range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; ++ } else { ++ /* minimum port is 1024. same behavior as default linux NAT. */ ++ min = 1024; ++ range_size = 65535 - min + 1; ++ } ++ ++ if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) ++ || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { ++ /* for now we do the same thing for both --random and --random-fully */ ++ ++ /* select a random starting point */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) ++ start = (uint16_t) (get_random_u32() % (u32) range_size); ++#else ++ start = (uint16_t) (prandom_u32() % (u32) range_size); ++#endif ++ } else { ++ ++ if ((original_port >= min && original_port <= min + range_size - 1) ++ || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { ++ /* 1. try to preserve the port if it's available */ ++ mapping = get_mapping6_by_ext_port(original_port, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return original_port; ++ } ++ } ++ ++ /* otherwise, we start from zero */ ++ start = 0; ++ } ++ ++ for (i = 0; i < range_size; i++) { ++ /* 2. try to find an available port */ ++ selected = min + ((start + i) % range_size); ++ mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return selected; ++ } ++ } ++ ++ /* 3. at least we tried. override a previous mapping. */ ++ selected = min + start; ++ mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); ++ kill_mapping6(mapping); ++ ++ return selected; ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static uint16_t find_appropriate_port(struct net *net, ++ const struct nf_conntrack_zone *zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#else ++static uint16_t find_appropriate_port(struct net *net, const u16 zone, ++ const uint16_t original_port, const __be32 ext_ip, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range) ++#else ++ struct nf_nat_range *range) ++#endif ++#endif ++{ ++ uint16_t min, start, selected, range_size, i; ++ struct nat_mapping *mapping = NULL; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&range->base_proto, 0, sizeof(range->base_proto)); ++#endif ++ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { ++ min = be16_to_cpu((range->min_proto).udp.port); ++ range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; ++ } else { ++ /* minimum port is 1024. same behavior as default linux NAT. 
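++ * an explicit proto range (NF_NAT_RANGE_PROTO_SPECIFIED) is used as-is and may therefore select ports below 1024.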
*/ ++ min = 1024; ++ range_size = 65535 - min + 1; ++ } ++ ++ if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) ++ || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { ++ /* for now we do the same thing for both --random and --random-fully */ ++ ++ /* select a random starting point */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) ++ start = (uint16_t) (get_random_u32() % (u32) range_size); ++#else ++ start = (uint16_t) (prandom_u32() % (u32) range_size); ++#endif ++ } else { ++ ++ if ((original_port >= min && original_port <= min + range_size - 1) ++ || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { ++ /* 1. try to preserve the port if it's available */ ++ mapping = get_mapping_by_ext_port(original_port, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return original_port; ++ } ++ } ++ ++ /* otherwise, we start from zero */ ++ start = 0; ++ } ++ ++ for (i = 0; i < range_size; i++) { ++ /* 2. try to find an available port */ ++ selected = min + ((start + i) % range_size); ++ mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); ++ if (mapping == NULL) { ++ return selected; ++ } ++ } ++ ++ /* 3. at least we tried. override a previous mapping. */ ++ selected = min + start; ++ mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); ++ kill_mapping(mapping); ++ ++ return selected; ++} ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static void find_leastused_ip6(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) ++#else ++static void find_leastused_ip6(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const union nf_inet_addr *src, ++ const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) ++#endif ++{ ++ unsigned int i; ++ /* Host order */ ++ u32 minip, maxip, j, dist; ++ bool full_range; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&(range->base_proto), 0, sizeof(range->base_proto)); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), ++ range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id, HASHTABLE_BUCKET_BITS); ++ //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id); ++#else ++ j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), ++ range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone, HASHTABLE_BUCKET_BITS); ++ //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone); ++#endif ++ ++ full_range = false; ++ for (i = 0; i <= 3; i++) { ++ /* If first bytes of the address are at the maximum, use the ++ * distance. Otherwise use the full range. 
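++ * each 32-bit word of the address is picked with reciprocal_scale() over the span allowed for that word.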
*/ ++ if (!full_range) { ++ minip = ntohl(range->min_addr.all[i]); ++ maxip = ntohl(range->max_addr.all[i]); ++ dist = maxip - minip + 1; ++ } else { ++ minip = 0; ++ dist = ~0; ++ } ++ ++ var_ipp->all[i] = (__force __be32) htonl(minip + reciprocal_scale(j, dist)); ++ if (var_ipp->all[i] != range->max_addr.all[i]) ++ full_range = true; ++ ++ if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) ++ j ^= (__force u32) dst->all[i]; ++ } ++} ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst) ++#else ++static __be32 find_leastused_ip(const u16 zone, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const __be32 src, const __be32 dst) ++#endif ++{ ++ /* Host order */ ++ u32 minip, maxip, j, dist; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ // nf_nat_range2 specific ++ memset(&(range->base_proto), 0, sizeof(range->base_proto)); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id, ++ HASHTABLE_BUCKET_BITS); ++ //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id); ++#else ++ j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone, ++ HASHTABLE_BUCKET_BITS); ++ //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone); ++#endif ++ ++ minip = ntohl(range->min_addr.ip); ++ maxip = ntohl(range->max_addr.ip); ++ dist = maxip - minip + 1; ++ ++ return (__be32) htonl(minip + reciprocal_scale(j, dist)); ++} ++ ++void nf_nat_fullcone_destroy_mappings(void) ++{ ++ struct nat_mapping *p_current; ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ struct nat_mapping6 *p6_current; ++#endif ++ struct hlist_node *tmp; ++ int i; ++ ++ spin_lock_bh(&fullconenat_lock); ++ ++ hash_for_each_safe(mapping_table_by_ext_port, i, tmp, p_current, node_by_ext_port) { ++ kill_mapping(p_current); ++ } ++ ++ spin_unlock_bh(&fullconenat_lock); ++ ++#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) ++ spin_lock_bh(&fullconenat6_lock); ++ ++ hash_for_each_safe(mapping6_table_by_ext_port, i, tmp, p6_current, node_by_ext_port) { ++ kill_mapping6(p6_current); ++ } ++ ++ spin_unlock_bh(&fullconenat6_lock); ++#endif ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_destroy_mappings); ++ ++/* ++ * nfproto choices ++ * enum { ++ NFPROTO_INET = 1, ++ NFPROTO_IPV4 = 2, ++ NFPROTO_IPV6 = 10, ++}; ++ */ ++static unsigned int nf_nat_handle_prerouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *newrange) ++#else ++ struct nf_nat_range *newrange) ++#endif ++{ ++ unsigned int ret; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ const struct nf_conntrack_zone *zone; ++#else ++ u16 zone; ++#endif ++ struct net *net; ++ struct nf_conn *ct; ++ enum ip_conntrack_info ctinfo; ++ struct nf_conntrack_tuple *ct_tuple_origin; ++ ++ uint16_t port, original_port; ++ uint8_t protonum; ++ ++/* NFPROTO specific def */ ++ struct nat_mapping *mapping; ++ struct nat_mapping6 *mapping_6; ++ 
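++ /* ip / ip_6 hold the inbound packet's external destination address, depending on nfproto */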
++ __be32 ip; ++ union nf_inet_addr *ip_6; ++ /* NFPROTO specific def end */ ++ ++ WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto == NFPROTO_IPV6)); ++ ++ /* NFPROTO specific init */ ++ mapping = NULL; ++ mapping_6 = NULL; ++ ++ ip = 0; ++ ip_6 = NULL; ++ /* NFPROTO specific init end */ ++ ++ original_port = 0; ++ ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ net = nf_ct_net(ct); ++ zone = nf_ct_zone(ct); ++ ++ ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ ++ protonum = (ct_tuple_origin->dst).protonum; ++ if (protonum != IPPROTO_UDP) { ++ // Currently only UDP traffic is supported for full-cone NAT. ++ // For other protos FULLCONENAT is equivalent to MASQUERADE. ++ return ret; ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ ip = (ct_tuple_origin->dst).u3.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ ip_6 = &(ct_tuple_origin->dst).u3; ++ } ++ ++ port = be16_to_cpu((ct_tuple_origin->dst).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ spin_lock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_lock_bh(&fullconenat6_lock); ++ } ++ ++ /* find an active mapping based on the inbound port */ ++ if (nfproto == NFPROTO_IPV4) { ++ mapping = get_mapping_by_ext_port(port, ip, net, zone); ++ } else if (nfproto == NFPROTO_IPV6) { ++ mapping_6 = get_mapping6_by_ext_port(port, ip_6, net, zone); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (mapping == NULL) { ++ goto unlock; ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (mapping_6 == NULL) { ++ goto unlock; ++ } ++ } ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_addr.ip = mapping->int_addr; ++ newrange->max_addr.ip = mapping->int_addr; ++ newrange->min_proto.udp.port = cpu_to_be16(mapping->int_port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_addr = mapping_6->int_addr; ++ newrange->max_addr = mapping_6->int_addr; ++ newrange->min_proto.udp.port = cpu_to_be16(mapping_6->int_port); ++ } ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ pr_debug(" %s ==> %pI4:%d\n", ++ nf_ct_stringify_tuple(ct_tuple_origin), &mapping->int_addr, mapping->int_port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ pr_debug(" %s ==> [%pI6c]:%d\n", ++ fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), &mapping_6->int_addr, mapping_6->int_port); ++ } ++ ++ ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); ++ ++ if (ret == NF_ACCEPT) { ++ if (nfproto == NFPROTO_IPV4) { ++ add_original_tuple_to_mapping(mapping, ct_tuple_origin); ++ pr_debug ++ ("INBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ } else if (nfproto == NFPROTO_IPV6) { ++ add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); ++ pr_debug ++ ("INBOUND: refer_count for mapping_6 at ext_port %d is now %d\n", ++ mapping_6->port, mapping_6->refer_count); ++ } ++ ++ } ++ ++unlock: ++ if (nfproto == NFPROTO_IPV4) { ++ spin_unlock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_unlock_bh(&fullconenat6_lock); ++ } ++ ++ return ret; ++ ++} ++ ++static unsigned int nf_nat_handle_postrouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, struct nf_nat_range2 *newrange, ++#else ++ struct nf_nat_range *range, struct nf_nat_range *newrange, ++#endif ++ const struct net_device *out) ++{ ++ unsigned int ret; ++#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) ++ const struct nf_conntrack_zone *zone; ++#else ++ u16 zone; ++#endif ++ struct net *net; ++ struct nf_conn *ct; ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn_nat *nat; ++ struct nf_conntrack_tuple *ct_tuple, *ct_tuple_origin; ++ uint16_t port, original_port, want_port; ++ uint8_t protonum; ++ bool is_src_mapping_active; ++ ++ /* NFPROTO specific def */ ++ struct nat_mapping *mapping, *src_mapping; ++ struct nat_mapping6 *mapping_6, *src_mapping_6; ++ ++ __be32 ip; ++ union nf_inet_addr *ip_6; ++ ++ const struct rtable *rt; ++ __be32 newsrc, nh; ++ ++ /* NFPROTO specific def end */ ++ ++ WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto == NFPROTO_IPV6)); ++ ++ /* NFPROTO specific init */ ++ mapping = NULL; ++ src_mapping = NULL; ++ mapping_6 = NULL; ++ src_mapping_6 = NULL; ++ ++ ip = 0; ++ ip_6 = NULL; ++ /* NFPROTO specific init end */ ++ ++ original_port = 0; ++ ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ net = nf_ct_net(ct); ++ zone = nf_ct_zone(ct); ++ ++ ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ protonum = (ct_tuple_origin->dst).protonum; ++ ++ if (range->flags & NF_NAT_RANGE_MAP_IPS) { ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_addr.ip = range->min_addr.ip; ++ newrange->max_addr.ip = range->max_addr.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_addr = range->min_addr; ++ newrange->max_addr = range->max_addr; ++ } ++ ++ } else { ++ if (nfproto == NFPROTO_IPV4) { ++ rt = skb_rtable(skb); ++ nh = rt_nexthop(rt, ip_hdr(skb)->daddr); ++ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); ++ ++ if (unlikely(!newsrc)) ++ return NF_DROP; ++ newrange->min_addr.ip = newsrc; ++ newrange->max_addr.ip = newsrc; ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (unlikely ++ (nat_ipv6_dev_get_saddr ++ (nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &(newrange->min_addr.in6)) < 0)) ++ return NF_DROP; ++ newrange->max_addr = newrange->min_addr; ++ ++ } ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) ++ nat = nf_ct_nat_ext_add(ct); ++#else ++ nat = nfct_nat(ct); ++#endif ++ if (likely(nat)) ++ nat->masq_index = out->ifindex; ++ ++ } ++ ++ if (protonum == IPPROTO_UDP) { ++ if (nfproto == NFPROTO_IPV4) { ++ ip = (ct_tuple_origin->src).u3.ip; ++ } else if (nfproto == NFPROTO_IPV6) { ++ ip_6 = &(ct_tuple_origin->src).u3; ++ } ++ ++ original_port = be16_to_cpu((ct_tuple_origin->src).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ spin_lock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_lock_bh(&fullconenat6_lock); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) ++ src_mapping = ++ get_mapping_by_int_src_inrange(ip, ++ original_port, ++ newrange->min_addr.ip, newrange->max_addr.ip); ++ else ++ src_mapping = get_mapping_by_int_src(ip, original_port, newrange->min_addr.ip); ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) ++ src_mapping_6 = ++ get_mapping6_by_int_src_inrange(ip_6, ++ original_port, ++ &newrange->min_addr, &newrange->max_addr); ++ else ++ src_mapping_6 = get_mapping6_by_int_src(ip_6, original_port, &newrange->min_addr); ++ } ++ ++ if (nfproto == NFPROTO_IPV4) { ++ is_src_mapping_active = src_mapping != NULL && check_mapping(src_mapping, net, zone); ++ } else if (nfproto == NFPROTO_IPV6) { ++ is_src_mapping_active = src_mapping_6 != NULL && check_mapping6(src_mapping_6, net, zone); ++ } ++ ++ if 
(is_src_mapping_active) { ++ ++ /* outbound nat: if a previously established mapping is active, ++ * we will reuse that mapping. */ ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ if (nfproto == NFPROTO_IPV4) { ++ newrange->min_proto.udp.port = cpu_to_be16(src_mapping->port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ newrange->min_proto.udp.port = cpu_to_be16(src_mapping_6->port); ++ } ++ ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) { ++ newrange->min_addr.ip = src_mapping->addr; ++ newrange->max_addr.ip = newrange->min_addr.ip; ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { ++ newrange->min_addr = src_mapping_6->addr; ++ newrange->max_addr = newrange->min_addr; ++ } ++ } ++ ++ } else { ++ ++ /* if not, we find a new external IP:port to map to. ++ * the SNAT may fail so we should re-check the mapped port later. */ ++ ++ if (nfproto == NFPROTO_IPV4) { ++ if (newrange->min_addr.ip != newrange->max_addr.ip) { ++ newrange->min_addr.ip = ++ find_leastused_ip(zone, range, ip, (ct_tuple_origin->dst).u3.ip); ++ newrange->max_addr.ip = newrange->min_addr.ip; ++ } ++ want_port = ++ find_appropriate_port(net, zone, original_port, newrange->min_addr.ip, range); ++ } else if (nfproto == NFPROTO_IPV6) { ++ ++ if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { ++ find_leastused_ip6(zone, range, ip_6, ++ &(ct_tuple_origin->dst).u3, &newrange->min_addr); ++ newrange->max_addr = newrange->min_addr; ++ } ++ ++ want_port = ++ find_appropriate_port6(net, zone, original_port, &newrange->min_addr, range); ++ } ++ ++ newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; ++ newrange->min_proto.udp.port = cpu_to_be16(want_port); ++ newrange->max_proto = newrange->min_proto; ++ ++ if (nfproto == NFPROTO_IPV4) { ++ src_mapping = NULL; ++ } else if (nfproto == NFPROTO_IPV6) { ++ src_mapping_6 = NULL; ++ } ++ ++ } ++ } ++ ++ /* do SNAT now */ ++ ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); ++ ++ if (protonum != IPPROTO_UDP) { ++ /* non-UDP packets, bailout */ ++ goto out; ++ } ++ if (ret != NF_ACCEPT) { ++ /* failed SNAT, bailout */ ++ goto unlock; ++ } ++ ++ /* the reply tuple contains the mapped port. */ ++ ct_tuple = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); ++ /* this is the resulted mapped port. 
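++ * read back from the reply tuple: nf_nat_setup_info() may have picked a different port than want_port on collision.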
*/ ++ port = be16_to_cpu((ct_tuple->dst).u.udp.port); ++ ++ if (nfproto == NFPROTO_IPV4) { ++ pr_debug(" %s ==> %d\n", nf_ct_stringify_tuple(ct_tuple_origin), port); ++ } else if (nfproto == NFPROTO_IPV6) { ++ pr_debug(" %s ==> %d\n", fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), port); ++ } ++ ++ /* save the mapping information into our mapping table */ ++ ++ if (nfproto == NFPROTO_IPV4) { ++ mapping = src_mapping; ++ if (mapping == NULL) { ++ mapping = allocate_mapping(ip, original_port, port, (ct_tuple->dst).u3.ip); ++ } ++ if (likely(mapping != NULL)) { ++ add_original_tuple_to_mapping(mapping, ct_tuple_origin); ++ pr_debug ++ (" OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping->port, mapping->refer_count); ++ } ++ } else if (nfproto == NFPROTO_IPV6) { ++ mapping_6 = src_mapping_6; ++ if (mapping_6 == NULL) { ++ mapping_6 = allocate_mapping6(ip_6, original_port, port, &(ct_tuple->dst).u3); ++ } ++ if (likely(mapping_6 != NULL)) { ++ add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); ++ pr_debug ++ ("OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", ++ mapping_6->port, mapping_6->refer_count); ++ } ++ } ++ ++unlock: ++ if (nfproto == NFPROTO_IPV4) { ++ spin_unlock_bh(&fullconenat_lock); ++ } else if (nfproto == NFPROTO_IPV6) { ++ spin_unlock_bh(&fullconenat6_lock); ++ } ++ ++out: ++ return ret; ++} ++ ++unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 newrange; ++#else ++ struct nf_nat_range newrange; ++#endif ++ ++ WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); ++ ++ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); ++ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); ++ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; ++ newrange.min_proto = range->min_proto; ++ newrange.max_proto = range->max_proto; ++ ++ switch (hooknum) { ++ case NF_INET_PRE_ROUTING: ++ /* inbound packets */ ++ return nf_nat_handle_prerouting(NFPROTO_IPV4, skb, hooknum, &newrange); ++ case NF_INET_POST_ROUTING: ++ /* outbound packets */ ++ return nf_nat_handle_postrouting(NFPROTO_IPV4, skb, hooknum, range, &newrange, out); ++ } ++ ++ WARN_ON(1); ++ // logical error ++ return 5; ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv4); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ++nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, ++ const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) ++{ ++#ifdef CONFIG_IPV6_MODULE ++ const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); ++ ++ if (!v6_ops) ++ return -EHOSTUNREACH; ++ ++ return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); ++#else ++ return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); ++#endif ++} ++#endif ++ ++unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range, ++#else ++ struct nf_nat_range *range, ++#endif ++ const struct net_device *out) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 newrange; ++#else ++ struct nf_nat_range newrange; ++#endif ++ ++ WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); ++ ++ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); ++ 
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); ++ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; ++ newrange.min_proto = range->min_proto; ++ newrange.max_proto = range->max_proto; ++ ++ switch (hooknum) { ++ case NF_INET_PRE_ROUTING: ++ /* inbound packets */ ++ return nf_nat_handle_prerouting(NFPROTO_IPV6, skb, hooknum, &newrange); ++ case NF_INET_POST_ROUTING: ++ /* outbound packets */ ++ return nf_nat_handle_postrouting(NFPROTO_IPV6, skb, hooknum, range, &newrange, out); ++ } ++ ++ WARN_ON(1); ++ // logical error ++ return 5; ++} ++ ++EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv6); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Syrone Wong "); ++MODULE_DESCRIPTION("Netfilter fullcone expression support library of RFC3489 full cone NAT"); +--- /dev/null ++++ b/net/netfilter/nft_ext_fullcone.c +@@ -0,0 +1,466 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Nftables NAT extension: fullcone expression support ++ * ++ * Copyright (c) 2018 Chion Tang ++ * Original xt_FULLCONENAT and related iptables extension author ++ * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB ++ * Added IPv6 support for xt_FULLCONENAT and ip6tables extension ++ * Ported to recent kernel versions ++ * Copyright (c) 2022 Syrone Wong ++ * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module ++ * Copyright (c) 2022 Alexandre Frade ++ * Ported to latest kernel source tree ++ */ ++ ++#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt ++#define NF_FULLCONE_WORKQUEUE_NAME "fullcone " KBUILD_MODNAME ": wq" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range); ++#else ++ struct nf_nat_range *range); ++#endif ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++struct notifier_block ct_event_notifier; ++#else ++struct nf_ct_event_notifier ct_event_notifier; ++#endif ++static DEFINE_MUTEX(nf_ct_net_event_lock); ++int ct_event_notifier_registered = 0; ++ ++int module_refer_count = 0; ++ ++static void gc_worker(struct work_struct *work); ++static struct workqueue_struct *wq __read_mostly = NULL; ++static DECLARE_DELAYED_WORK(gc_worker_wk, gc_worker); ++ ++static void gc_worker(struct work_struct *work) ++{ ++ nf_nat_fullcone_handle_dying_tuples(); ++} ++ ++struct nft_fullcone { ++ u32 flags; ++ u8 sreg_proto_min; ++ u8 sreg_proto_max; ++}; ++ ++static const struct nla_policy nft_fullcone_policy[NFTA_FULLCONE_MAX + 1] = { ++ [NFTA_FULLCONE_FLAGS] = {.type = NLA_U32 }, ++ [NFTA_FULLCONE_REG_PROTO_MIN] = {.type = NLA_U32 }, ++ [NFTA_FULLCONE_REG_PROTO_MAX] = {.type = NLA_U32 }, ++}; ++ ++/* conntrack destroy event callback function */ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static int ct_event_cb(struct notifier_block *this, unsigned long events, void *ptr) ++{ ++ struct nf_ct_event *item = ptr; ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++static int ct_event_cb(unsigned int events, const struct nf_ct_event *item) ++{ ++#else ++static int ct_event_cb(unsigned int events, struct nf_ct_event *item) ++{ ++#endif ++ struct nf_conn *ct; ++ struct nf_conntrack_tuple *ct_tuple_reply, *ct_tuple_original; ++ uint8_t protonum; ++ struct tuple_list *dying_tuple_item; ++ ++ ct = item->ct; ++ /* we handle only conntrack destroy events */ ++ if (ct == NULL || !(events & (1 << IPCT_DESTROY))) { ++ return 
0; ++ } ++ ++ ct_tuple_original = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ ++ ct_tuple_reply = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); ++ ++ protonum = (ct_tuple_original->dst).protonum; ++ if (protonum != IPPROTO_UDP) { ++ return 0; ++ } ++ ++ dying_tuple_item = kmalloc(sizeof(struct tuple_list), GFP_ATOMIC); ++ ++ if (dying_tuple_item == NULL) { ++ pr_debug("warning: ct_event_cb(): kmalloc failed.\n"); ++ return 0; ++ } ++ ++ memcpy(&(dying_tuple_item->tuple_original), ct_tuple_original, sizeof(struct nf_conntrack_tuple)); ++ memcpy(&(dying_tuple_item->tuple_reply), ct_tuple_reply, sizeof(struct nf_conntrack_tuple)); ++ ++ nf_nat_fullcone_dying_tuple_list_add(&(dying_tuple_item->list)); ++ ++ if (wq != NULL) ++ queue_delayed_work(wq, &gc_worker_wk, msecs_to_jiffies(100)); ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++static int exp_event_cb(unsigned int events, const struct nf_exp_event *item) ++{ ++ return 0; ++} ++#endif ++ ++static int nft_fullcone_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) ++{ ++ int err; ++ ++ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); ++ if (err < 0) ++ return err; ++ ++ // TODO: check hooks ++ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING)); ++} ++ ++static int nft_fullcone_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr *const tb[]) ++{ ++ int err; ++ int register_ct_notifier_ret = 0; ++ ++ err = nf_ct_netns_get(ctx->net, ctx->family); ++ ++ mutex_lock(&nf_ct_net_event_lock); ++ ++ module_refer_count++; ++ ++ pr_debug("nft_fullcone_init(): module_refer_count is now %d\n", module_refer_count); ++ ++ if (module_refer_count == 1) { ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ ct_event_notifier.notifier_call = ct_event_cb; ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ ct_event_notifier.ct_event = ct_event_cb; ++ ct_event_notifier.exp_event = exp_event_cb; ++#else ++ ct_event_notifier.fcn = ct_event_cb; ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++ if (!READ_ONCE(ctx->net->ct.nf_conntrack_event_cb)) { ++ nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); ++ } ++#else ++ register_ct_notifier_ret = nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); ++#endif ++ ++ if (register_ct_notifier_ret) { ++ /* non-zero means failure */ ++ pr_warn("failed to register a conntrack notifier. 
Disable active GC for mappings.\n"); ++ } else { ++ ct_event_notifier_registered = 1; ++ pr_debug("nft_fullcone_init(): ct_event_notifier registered\n"); ++ } ++ ++ } ++ ++ mutex_unlock(&nf_ct_net_event_lock); ++ ++ return err; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 2, 0) ++static int nft_fullcone_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) ++#else ++static int nft_fullcone_dump(struct sk_buff *skb, const struct nft_expr *expr) ++#endif ++{ ++ const struct nft_fullcone *priv = nft_expr_priv(expr); ++ ++ if (priv->flags != 0 && nla_put_be32(skb, NFTA_FULLCONE_FLAGS, htonl(priv->flags))) ++ goto nla_put_failure; ++ ++ if (priv->sreg_proto_min) { ++ if (nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MIN, ++ priv->sreg_proto_min) || ++ nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MAX, priv->sreg_proto_max)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -1; ++} ++ ++/* nft_fullcone_set_regs sets nft_regs from nft_expr fullcone specific private data */ ++static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 *range ++#else ++ struct nf_nat_range *range ++#endif ++ ) ++{ ++ // private data connected via nft_expr_type.ops <==> nft_expr_ops.type ++ // private data type from nft_expr_type.{policy,maxattr,ops} ++ // private data size from nft_expr_ops.size ++ struct nft_fullcone *priv = nft_expr_priv(expr); ++ range->flags = priv->flags; ++ if (priv->sreg_proto_min) { ++ range->min_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_min]); ++ range->max_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_max]); ++ } ++} ++ ++static void nft_fullcone_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 range; ++#else ++ struct nf_nat_range range; ++#endif ++ ++ memset(&range, 0, sizeof(range)); ++ nft_fullcone_set_regs(expr, regs, &range); ++ regs->verdict.code = nf_nat_fullcone_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); ++} ++ ++static void nft_fullcone_common_destory(const struct nft_ctx *ctx) ++{ ++ mutex_lock(&nf_ct_net_event_lock); ++ ++ module_refer_count--; ++ ++ pr_debug("nft_fullcone_common_destory(): module_refer_count is now %d\n", module_refer_count); ++ ++ if (module_refer_count == 0) { ++ if (ct_event_notifier_registered) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) ++ nf_conntrack_unregister_notifier(ctx->net); ++#else ++ nf_conntrack_unregister_notifier(ctx->net, &ct_event_notifier); ++#endif ++ ct_event_notifier_registered = 0; ++ ++ pr_debug("nft_fullcone_common_destory(): ct_event_notifier unregistered\n"); ++ ++ } ++ } ++ ++ mutex_unlock(&nf_ct_net_event_lock); ++} ++ ++static void nft_fullcone_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_IPV4); ++} ++ ++static struct nft_expr_type nft_fullcone_ipv4_type; ++static const struct nft_expr_ops nft_fullcone_ipv4_ops = { ++ .type = &nft_fullcone_ipv4_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_ipv4_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_ipv4_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type 
nft_fullcone_ipv4_type __read_mostly = { ++ .family = NFPROTO_IPV4, ++ .name = "fullcone", ++ .ops = &nft_fullcone_ipv4_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++#ifdef CONFIG_NF_TABLES_IPV6 ++static void nft_fullcone_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) ++ struct nf_nat_range2 range; ++#else ++ struct nf_nat_range range; ++#endif ++ ++ memset(&range, 0, sizeof(range)); ++ nft_fullcone_set_regs(expr, regs, &range); ++ regs->verdict.code = nf_nat_fullcone_ipv6(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); ++} ++ ++static void nft_fullcone_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_IPV6); ++} ++ ++static struct nft_expr_type nft_fullcone_ipv6_type; ++static const struct nft_expr_ops nft_fullcone_ipv6_ops = { ++ .type = &nft_fullcone_ipv6_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_ipv6_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_ipv6_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type nft_fullcone_ipv6_type __read_mostly = { ++ .family = NFPROTO_IPV6, ++ .name = "fullcone", ++ .ops = &nft_fullcone_ipv6_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init nft_fullcone_module_init_ipv6(void) ++{ ++ return nft_register_expr(&nft_fullcone_ipv6_type); ++} ++ ++static void nft_fullcone_module_exit_ipv6(void) ++{ ++ nft_unregister_expr(&nft_fullcone_ipv6_type); ++} ++#else ++static inline int nft_fullcone_module_init_ipv6(void) ++{ ++ return 0; ++} ++ ++static inline void nft_fullcone_module_exit_ipv6(void) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NF_TABLES_INET ++static void nft_fullcone_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) ++{ ++ switch (nft_pf(pkt)) { ++ case NFPROTO_IPV4: ++ return nft_fullcone_ipv4_eval(expr, regs, pkt); ++ case NFPROTO_IPV6: ++ return nft_fullcone_ipv6_eval(expr, regs, pkt); ++ } ++ ++ WARN_ON_ONCE(1); ++} ++ ++static void nft_fullcone_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) ++{ ++ nft_fullcone_common_destory(ctx); ++ nf_ct_netns_put(ctx->net, NFPROTO_INET); ++} ++ ++static struct nft_expr_type nft_fullcone_inet_type; ++static const struct nft_expr_ops nft_fullcone_inet_ops = { ++ .type = &nft_fullcone_inet_type, ++ .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), ++ .eval = nft_fullcone_inet_eval, ++ .init = nft_fullcone_init, ++ .destroy = nft_fullcone_inet_destroy, ++ .dump = nft_fullcone_dump, ++ .validate = nft_fullcone_validate, ++}; ++ ++static struct nft_expr_type nft_fullcone_inet_type __read_mostly = { ++ .family = NFPROTO_INET, ++ .name = "fullcone", ++ .ops = &nft_fullcone_inet_ops, ++ .policy = nft_fullcone_policy, ++ .maxattr = NFTA_FULLCONE_MAX, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init nft_fullcone_module_init_inet(void) ++{ ++ return nft_register_expr(&nft_fullcone_inet_type); ++} ++ ++static void nft_fullcone_module_exit_inet(void) ++{ ++ nft_unregister_expr(&nft_fullcone_inet_type); ++} ++#else ++static inline int nft_fullcone_module_init_inet(void) ++{ ++ return 0; ++} ++ ++static inline void nft_fullcone_module_exit_inet(void) ++{ ++} ++#endif ++ ++static int __init 
nft_fullcone_module_init(void) ++{ ++ int ret; ++ ++ ret = nft_fullcone_module_init_ipv6(); ++ if (ret < 0) ++ return ret; ++ ++ ret = nft_fullcone_module_init_inet(); ++ if (ret < 0) { ++ nft_fullcone_module_exit_ipv6(); ++ return ret; ++ } ++ ++ ret = nft_register_expr(&nft_fullcone_ipv4_type); ++ if (ret < 0) { ++ nft_fullcone_module_exit_inet(); ++ nft_fullcone_module_exit_ipv6(); ++ return ret; ++ } ++ ++ wq = create_singlethread_workqueue(NF_FULLCONE_WORKQUEUE_NAME); ++ if (wq == NULL) { ++ pr_err("failed to create workqueue %s\n", NF_FULLCONE_WORKQUEUE_NAME); ++ } ++ ++ return ret; ++} ++ ++static void __exit nft_fullcone_module_exit(void) ++{ ++ nft_fullcone_module_exit_ipv6(); ++ nft_fullcone_module_exit_inet(); ++ nft_unregister_expr(&nft_fullcone_ipv4_type); ++ ++ if (wq) { ++ cancel_delayed_work_sync(&gc_worker_wk); ++ flush_workqueue(wq); ++ destroy_workqueue(wq); ++ } ++ ++ nf_nat_fullcone_handle_dying_tuples(); ++ nf_nat_fullcone_destroy_mappings(); ++} ++ ++module_init(nft_fullcone_module_init); ++module_exit(nft_fullcone_module_exit); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Syrone Wong "); ++MODULE_ALIAS_NFT_EXPR("fullcone"); ++MODULE_DESCRIPTION("Netfilter nftables fullcone expression support of RFC3489 full cone NAT"); diff --git a/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch new file mode 100644 index 0000000..ca309c3 --- /dev/null +++ b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch @@ -0,0 +1,809 @@ +From 9a066610b055315bf155a99b2ea0a58245ef11e2 Mon Sep 17 00:00:00 2001 +From: Felix Fietkau +Date: Tue, 20 Feb 2018 15:56:02 +0100 +Subject: [PATCH 2/2] netfilter: add xt_FLOWOFFLOAD target + +Signed-off-by: Felix Fietkau +Signed-off-by: Alexandre Frade +--- + include/net/netfilter/nf_flow_table.h | 5 + + include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h | 17 + + net/netfilter/Kconfig | 9 + + net/netfilter/Makefile | 1 + + net/netfilter/nf_flow_table_core.c | 5 +- + net/netfilter/xt_FLOWOFFLOAD.c | 698 ++++++++++++++++++ + 6 files changed, 732 insertions(+), 3 deletions(-) + create mode 100644 include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h + create mode 100644 net/netfilter/xt_FLOWOFFLOAD.c + +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -294,6 +294,11 @@ void nf_flow_table_free(struct nf_flowta + + void flow_offload_teardown(struct flow_offload *flow); + ++int nf_flow_table_iterate(struct nf_flowtable *flow_table, ++ void (*iter)(struct nf_flowtable *flowtable, ++ struct flow_offload *flow, void *data), ++ void *data); ++ + void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +--- /dev/null ++++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++#ifndef _XT_FLOWOFFLOAD_H ++#define _XT_FLOWOFFLOAD_H ++ ++#include ++ ++enum { ++ XT_FLOWOFFLOAD_HW = 1 << 0, ++ ++ XT_FLOWOFFLOAD_MASK = XT_FLOWOFFLOAD_HW ++}; ++ ++struct xt_flowoffload_target_info { ++ __u32 flags; ++}; ++ ++#endif /* _XT_FLOWOFFLOAD_H */ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -1041,6 +1041,15 @@ config NETFILTER_XT_TARGET_NOTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_CT + ++config NETFILTER_XT_TARGET_FLOWOFFLOAD ++ tristate '"FLOWOFFLOAD" target support' ++ 
depends on NF_FLOW_TABLE ++ depends on NETFILTER_INGRESS ++ help ++ This option adds a `FLOWOFFLOAD' target, which uses the nf_flow_offload ++ module to speed up processing of packets by bypassing the usual ++ netfilter chains ++ + config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_ADVANCED +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -168,6 +168,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF + obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o + obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o + obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o ++obj-$(CONFIG_NETFILTER_XT_TARGET_FLOWOFFLOAD) += xt_FLOWOFFLOAD.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +--- a/net/netfilter/nf_flow_table_core.c ++++ b/net/netfilter/nf_flow_table_core.c +@@ -7,7 +7,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -373,8 +372,7 @@ flow_offload_lookup(struct nf_flowtable + } + EXPORT_SYMBOL_GPL(flow_offload_lookup); + +-static int +-nf_flow_table_iterate(struct nf_flowtable *flow_table, ++int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct nf_flowtable *flowtable, + struct flow_offload *flow, void *data), + void *data) +@@ -435,6 +433,7 @@ static void nf_flow_offload_gc_step(stru + nf_flow_offload_stats(flow_table, flow); + } + } ++EXPORT_SYMBOL_GPL(nf_flow_table_iterate); + + void nf_flow_table_gc_run(struct nf_flowtable *flow_table) + { +--- /dev/null ++++ b/net/netfilter/xt_FLOWOFFLOAD.c +@@ -0,0 +1,698 @@ ++/* ++ * Copyright (C) 2018-2021 Felix Fietkau ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
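A usage sketch for the FLOWOFFLOAD target described in the Kconfig help above, assuming an iptables build that ships the matching libxt_FLOWOFFLOAD userspace extension; the --hw flag mirrors XT_FLOWOFFLOAD_HW and is an assumption about that extension, not something this kernel patch provides on its own:

    # software flow offload for established connections
    iptables -A FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD
    # hardware offload, if both the extension and the NIC support it
    iptables -A FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD --hw

Only confirmed TCP connections in the established state and UDP flows are picked up; everything else falls through with XT_CONTINUE, as the target code below shows.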
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct xt_flowoffload_hook { ++ struct hlist_node list; ++ struct nf_hook_ops ops; ++ struct net *net; ++ bool registered; ++ bool used; ++}; ++ ++struct xt_flowoffload_table { ++ struct nf_flowtable ft; ++ struct hlist_head hooks; ++ struct delayed_work work; ++}; ++ ++struct nf_forward_info { ++ const struct net_device *indev; ++ const struct net_device *outdev; ++ const struct net_device *hw_outdev; ++ struct id { ++ __u16 id; ++ __be16 proto; ++ } encap[NF_FLOW_TABLE_ENCAP_MAX]; ++ u8 num_encaps; ++ u8 ingress_vlans; ++ u8 h_source[ETH_ALEN]; ++ u8 h_dest[ETH_ALEN]; ++ enum flow_offload_xmit_type xmit_type; ++}; ++ ++static DEFINE_SPINLOCK(hooks_lock); ++ ++struct xt_flowoffload_table flowtable[2]; ++ ++static unsigned int ++xt_flowoffload_net_hook(void *priv, struct sk_buff *skb, ++ const struct nf_hook_state *state) ++{ ++ struct vlan_ethhdr *veth; ++ __be16 proto; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_8021Q): ++ veth = (struct vlan_ethhdr *)skb_mac_header(skb); ++ proto = veth->h_vlan_encapsulated_proto; ++ break; ++ case htons(ETH_P_PPP_SES): ++ if (!nf_flow_pppoe_proto(skb, &proto)) ++ return NF_ACCEPT; ++ break; ++ default: ++ proto = skb->protocol; ++ break; ++ } ++ ++ switch (proto) { ++ case htons(ETH_P_IP): ++ return nf_flow_offload_ip_hook(priv, skb, state); ++ case htons(ETH_P_IPV6): ++ return nf_flow_offload_ipv6_hook(priv, skb, state); ++ } ++ ++ return NF_ACCEPT; ++} ++ ++static int ++xt_flowoffload_create_hook(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ struct nf_hook_ops *ops; ++ ++ hook = kzalloc(sizeof(*hook), GFP_ATOMIC); ++ if (!hook) ++ return -ENOMEM; ++ ++ ops = &hook->ops; ++ ops->pf = NFPROTO_NETDEV; ++ ops->hooknum = NF_NETDEV_INGRESS; ++ ops->priority = 10; ++ ops->priv = &table->ft; ++ ops->hook = xt_flowoffload_net_hook; ++ ops->dev = dev; ++ ++ hlist_add_head(&hook->list, &table->hooks); ++ mod_delayed_work(system_power_efficient_wq, &table->work, 0); ++ ++ return 0; ++} ++ ++static struct xt_flowoffload_hook * ++flow_offload_lookup_hook(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->ops.dev == dev) ++ return hook; ++ } ++ ++ return NULL; ++} ++ ++static void ++xt_flowoffload_check_device(struct xt_flowoffload_table *table, ++ struct net_device *dev) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++ if (!dev) ++ return; ++ ++ spin_lock_bh(&hooks_lock); ++ hook = flow_offload_lookup_hook(table, dev); ++ if (hook) ++ hook->used = true; ++ else ++ xt_flowoffload_create_hook(table, dev); ++ spin_unlock_bh(&hooks_lock); ++} ++ ++static void ++xt_flowoffload_register_hooks(struct xt_flowoffload_table *table) ++{ ++ struct xt_flowoffload_hook *hook; ++ ++restart: ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->registered) ++ continue; ++ ++ hook->registered = true; ++ hook->net = dev_net(hook->ops.dev); ++ spin_unlock_bh(&hooks_lock); ++ nf_register_net_hook(hook->net, &hook->ops); ++ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD) ++ table->ft.type->setup(&table->ft, hook->ops.dev, ++ FLOW_BLOCK_BIND); ++ spin_lock_bh(&hooks_lock); ++ goto restart; ++ } ++ ++} ++ ++static bool ++xt_flowoffload_cleanup_hooks(struct xt_flowoffload_table *table) ++{ ++ struct xt_flowoffload_hook *hook; ++ bool active = false; ++ ++restart: ++ 
spin_lock_bh(&hooks_lock); ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->used || !hook->registered) { ++ active = true; ++ continue; ++ } ++ ++ hlist_del(&hook->list); ++ spin_unlock_bh(&hooks_lock); ++ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD) ++ table->ft.type->setup(&table->ft, hook->ops.dev, ++ FLOW_BLOCK_UNBIND); ++ nf_unregister_net_hook(hook->net, &hook->ops); ++ kfree(hook); ++ goto restart; ++ } ++ spin_unlock_bh(&hooks_lock); ++ ++ return active; ++} ++ ++static void ++xt_flowoffload_check_hook(struct nf_flowtable *flowtable, ++ struct flow_offload *flow, void *data) ++{ ++ struct xt_flowoffload_table *table; ++ struct flow_offload_tuple *tuple0 = &flow->tuplehash[0].tuple; ++ struct flow_offload_tuple *tuple1 = &flow->tuplehash[1].tuple; ++ struct xt_flowoffload_hook *hook; ++ ++ table = container_of(flowtable, struct xt_flowoffload_table, ft); ++ ++ spin_lock_bh(&hooks_lock); ++ hlist_for_each_entry(hook, &table->hooks, list) { ++ if (hook->ops.dev->ifindex != tuple0->iifidx && ++ hook->ops.dev->ifindex != tuple1->iifidx) ++ continue; ++ ++ hook->used = true; ++ } ++ spin_unlock_bh(&hooks_lock); ++} ++ ++static void ++xt_flowoffload_hook_work(struct work_struct *work) ++{ ++ struct xt_flowoffload_table *table; ++ struct xt_flowoffload_hook *hook; ++ int err; ++ ++ table = container_of(work, struct xt_flowoffload_table, work.work); ++ ++ spin_lock_bh(&hooks_lock); ++ xt_flowoffload_register_hooks(table); ++ hlist_for_each_entry(hook, &table->hooks, list) ++ hook->used = false; ++ spin_unlock_bh(&hooks_lock); ++ ++ err = nf_flow_table_iterate(&table->ft, xt_flowoffload_check_hook, ++ NULL); ++ if (err && err != -EAGAIN) ++ goto out; ++ ++ if (!xt_flowoffload_cleanup_hooks(table)) ++ return; ++ ++out: ++ queue_delayed_work(system_power_efficient_wq, &table->work, HZ); ++} ++ ++static bool ++xt_flowoffload_skip(struct sk_buff *skb, int family) ++{ ++ if (skb_sec_path(skb)) ++ return true; ++ ++ if (family == NFPROTO_IPV4) { ++ const struct ip_options *opt = &(IPCB(skb)->opt); ++ ++ if (unlikely(opt->optlen)) ++ return true; ++ } ++ ++ return false; ++} ++ ++static enum flow_offload_xmit_type nf_xmit_type(struct dst_entry *dst) ++{ ++ if (dst_xfrm(dst)) ++ return FLOW_OFFLOAD_XMIT_XFRM; ++ ++ return FLOW_OFFLOAD_XMIT_NEIGH; ++} ++ ++static void nf_default_forward_path(struct nf_flow_route *route, ++ struct dst_entry *dst_cache, ++ enum ip_conntrack_dir dir, ++ struct net_device **dev) ++{ ++ dev[!dir] = dst_cache->dev; ++ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; ++ route->tuple[dir].dst = dst_cache; ++ route->tuple[dir].xmit_type = nf_xmit_type(dst_cache); ++} ++ ++static bool nf_is_valid_ether_device(const struct net_device *dev) ++{ ++ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || ++ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) ++ return false; ++ ++ return true; ++} ++ ++static void nf_dev_path_info(const struct net_device_path_stack *stack, ++ struct nf_forward_info *info, ++ unsigned char *ha) ++{ ++ const struct net_device_path *path; ++ int i; ++ ++ memcpy(info->h_dest, ha, ETH_ALEN); ++ ++ for (i = 0; i < stack->num_paths; i++) { ++ path = &stack->path[i]; ++ switch (path->type) { ++ case DEV_PATH_ETHERNET: ++ case DEV_PATH_DSA: ++ case DEV_PATH_VLAN: ++ case DEV_PATH_PPPOE: ++ info->indev = path->dev; ++ if (is_zero_ether_addr(info->h_source)) ++ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); ++ ++ if (path->type == DEV_PATH_ETHERNET) ++ break; ++ if (path->type == DEV_PATH_DSA) 
{ ++ i = stack->num_paths; ++ break; ++ } ++ ++ /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ ++ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { ++ info->indev = NULL; ++ break; ++ } ++ if (!info->outdev) ++ info->outdev = path->dev; ++ info->encap[info->num_encaps].id = path->encap.id; ++ info->encap[info->num_encaps].proto = path->encap.proto; ++ info->num_encaps++; ++ if (path->type == DEV_PATH_PPPOE) ++ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); ++ break; ++ case DEV_PATH_BRIDGE: ++ if (is_zero_ether_addr(info->h_source)) ++ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); ++ ++ switch (path->bridge.vlan_mode) { ++ case DEV_PATH_BR_VLAN_UNTAG_HW: ++ info->ingress_vlans |= BIT(info->num_encaps - 1); ++ break; ++ case DEV_PATH_BR_VLAN_TAG: ++ info->encap[info->num_encaps].id = path->bridge.vlan_id; ++ info->encap[info->num_encaps].proto = path->bridge.vlan_proto; ++ info->num_encaps++; ++ break; ++ case DEV_PATH_BR_VLAN_UNTAG: ++ info->num_encaps--; ++ break; ++ case DEV_PATH_BR_VLAN_KEEP: ++ break; ++ } ++ break; ++ default: ++ info->indev = NULL; ++ break; ++ } ++ } ++ if (!info->outdev) ++ info->outdev = info->indev; ++ ++ info->hw_outdev = info->indev; ++ ++ if (nf_is_valid_ether_device(info->indev)) ++ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; ++} ++ ++static int nf_dev_fill_forward_path(const struct nf_flow_route *route, ++ const struct dst_entry *dst_cache, ++ const struct nf_conn *ct, ++ enum ip_conntrack_dir dir, u8 *ha, ++ struct net_device_path_stack *stack) ++{ ++ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; ++ struct net_device *dev = dst_cache->dev; ++ struct neighbour *n; ++ u8 nud_state; ++ ++ if (!nf_is_valid_ether_device(dev)) ++ goto out; ++ ++ n = dst_neigh_lookup(dst_cache, daddr); ++ if (!n) ++ return -1; ++ ++ read_lock_bh(&n->lock); ++ nud_state = n->nud_state; ++ ether_addr_copy(ha, n->ha); ++ read_unlock_bh(&n->lock); ++ neigh_release(n); ++ ++ if (!(nud_state & NUD_VALID)) ++ return -1; ++ ++out: ++ return dev_fill_forward_path(dev, ha, stack); ++} ++ ++static void nf_dev_forward_path(struct nf_flow_route *route, ++ const struct nf_conn *ct, ++ enum ip_conntrack_dir dir, ++ struct net_device **devs) ++{ ++ const struct dst_entry *dst = route->tuple[dir].dst; ++ struct net_device_path_stack stack; ++ struct nf_forward_info info = {}; ++ unsigned char ha[ETH_ALEN]; ++ int i; ++ ++ if (nf_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) ++ nf_dev_path_info(&stack, &info, ha); ++ ++ devs[!dir] = (struct net_device *)info.indev; ++ if (!info.indev) ++ return; ++ ++ route->tuple[!dir].in.ifindex = info.indev->ifindex; ++ for (i = 0; i < info.num_encaps; i++) { ++ route->tuple[!dir].in.encap[i].id = info.encap[i].id; ++ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; ++ } ++ route->tuple[!dir].in.num_encaps = info.num_encaps; ++ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; ++ ++ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { ++ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); ++ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); ++ route->tuple[dir].out.ifindex = info.outdev->ifindex; ++ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; ++ route->tuple[dir].xmit_type = info.xmit_type; ++ } ++} ++ ++static int ++xt_flowoffload_route(struct sk_buff *skb, const struct nf_conn *ct, ++ const struct xt_action_param *par, ++ struct nf_flow_route *route, enum ip_conntrack_dir dir, ++ struct net_device **devs) ++{ ++ struct dst_entry *this_dst = skb_dst(skb); ++ struct dst_entry 
*other_dst = NULL; ++ struct flowi fl; ++ ++ memset(&fl, 0, sizeof(fl)); ++ switch (xt_family(par)) { ++ case NFPROTO_IPV4: ++ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; ++ fl.u.ip4.flowi4_oif = xt_in(par)->ifindex; ++ break; ++ case NFPROTO_IPV6: ++ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.dst.u3.in6; ++ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; ++ fl.u.ip6.flowi6_oif = xt_in(par)->ifindex; ++ break; ++ } ++ ++ nf_route(xt_net(par), &other_dst, &fl, false, xt_family(par)); ++ if (!other_dst) ++ return -ENOENT; ++ ++ nf_default_forward_path(route, this_dst, dir, devs); ++ nf_default_forward_path(route, other_dst, !dir, devs); ++ ++ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && ++ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { ++ nf_dev_forward_path(route, ct, dir, devs); ++ nf_dev_forward_path(route, ct, !dir, devs); ++ } ++ ++ return 0; ++} ++ ++static unsigned int ++flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par) ++{ ++ struct xt_flowoffload_table *table; ++ const struct xt_flowoffload_target_info *info = par->targinfo; ++ struct tcphdr _tcph, *tcph = NULL; ++ enum ip_conntrack_info ctinfo; ++ enum ip_conntrack_dir dir; ++ struct nf_flow_route route = {}; ++ struct flow_offload *flow = NULL; ++ struct net_device *devs[2] = {}; ++ struct nf_conn *ct; ++ struct net *net; ++ ++ if (xt_flowoffload_skip(skb, xt_family(par))) ++ return XT_CONTINUE; ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ if (ct == NULL) ++ return XT_CONTINUE; ++ ++ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { ++ case IPPROTO_TCP: ++ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) ++ return XT_CONTINUE; ++ ++ tcph = skb_header_pointer(skb, par->thoff, ++ sizeof(_tcph), &_tcph); ++ if (unlikely(!tcph || tcph->fin || tcph->rst)) ++ return XT_CONTINUE; ++ break; ++ case IPPROTO_UDP: ++ break; ++ default: ++ return XT_CONTINUE; ++ } ++ ++ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ++ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) ++ return XT_CONTINUE; ++ ++ if (!nf_ct_is_confirmed(ct)) ++ return XT_CONTINUE; ++ ++ devs[dir] = xt_out(par); ++ devs[!dir] = xt_in(par); ++ ++ if (!devs[dir] || !devs[!dir]) ++ return XT_CONTINUE; ++ ++ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) ++ return XT_CONTINUE; ++ ++ dir = CTINFO2DIR(ctinfo); ++ ++ if (xt_flowoffload_route(skb, ct, par, &route, dir, devs) < 0) ++ goto err_flow_route; ++ ++ flow = flow_offload_alloc(ct); ++ if (!flow) ++ goto err_flow_alloc; ++ ++ flow_offload_route_init(flow, &route); ++ ++ if (tcph) { ++ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ++ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ++ } ++ ++ table = &flowtable[!!(info->flags & XT_FLOWOFFLOAD_HW)]; ++ ++ net = read_pnet(&table->ft.net); ++ if (!net) ++ write_pnet(&table->ft.net, xt_net(par)); ++ ++ if (flow_offload_add(&table->ft, flow) < 0) ++ goto err_flow_add; ++ ++ xt_flowoffload_check_device(table, devs[0]); ++ xt_flowoffload_check_device(table, devs[1]); ++ ++ dst_release(route.tuple[!dir].dst); ++ ++ return XT_CONTINUE; ++ ++err_flow_add: ++ flow_offload_free(flow); ++err_flow_alloc: ++ dst_release(route.tuple[!dir].dst); ++err_flow_route: ++ clear_bit(IPS_OFFLOAD_BIT, &ct->status); ++ ++ return XT_CONTINUE; ++} ++ ++static int flowoffload_chk(const struct xt_tgchk_param *par) ++{ ++ struct xt_flowoffload_target_info *info = par->targinfo; ++ ++ if (info->flags & ~XT_FLOWOFFLOAD_MASK) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct xt_target offload_tg_reg 
__read_mostly = { ++ .family = NFPROTO_UNSPEC, ++ .name = "FLOWOFFLOAD", ++ .revision = 0, ++ .targetsize = sizeof(struct xt_flowoffload_target_info), ++ .usersize = sizeof(struct xt_flowoffload_target_info), ++ .checkentry = flowoffload_chk, ++ .target = flowoffload_tg, ++ .me = THIS_MODULE, ++}; ++ ++static int flow_offload_netdev_event(struct notifier_block *this, ++ unsigned long event, void *ptr) ++{ ++ struct xt_flowoffload_hook *hook0, *hook1; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ ++ if (event != NETDEV_UNREGISTER) ++ return NOTIFY_DONE; ++ ++ spin_lock_bh(&hooks_lock); ++ hook0 = flow_offload_lookup_hook(&flowtable[0], dev); ++ if (hook0) ++ hlist_del(&hook0->list); ++ ++ hook1 = flow_offload_lookup_hook(&flowtable[1], dev); ++ if (hook1) ++ hlist_del(&hook1->list); ++ spin_unlock_bh(&hooks_lock); ++ ++ if (hook0) { ++ nf_unregister_net_hook(hook0->net, &hook0->ops); ++ kfree(hook0); ++ } ++ ++ if (hook1) { ++ nf_unregister_net_hook(hook1->net, &hook1->ops); ++ kfree(hook1); ++ } ++ ++ nf_flow_table_cleanup(dev); ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block flow_offload_netdev_notifier = { ++ .notifier_call = flow_offload_netdev_event, ++}; ++ ++static int nf_flow_rule_route_inet(struct net *net, ++ struct flow_offload *flow, ++ enum flow_offload_tuple_dir dir, ++ struct nf_flow_rule *flow_rule) ++{ ++ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; ++ int err; ++ ++ switch (flow_tuple->l3proto) { ++ case NFPROTO_IPV4: ++ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); ++ break; ++ case NFPROTO_IPV6: ++ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); ++ break; ++ default: ++ err = -1; ++ break; ++ } ++ ++ return err; ++} ++ ++static struct nf_flowtable_type flowtable_inet = { ++ .family = NFPROTO_INET, ++ .init = nf_flow_table_init, ++ .setup = nf_flow_table_offload_setup, ++ .action = nf_flow_rule_route_inet, ++ .free = nf_flow_table_free, ++ .hook = xt_flowoffload_net_hook, ++ .owner = THIS_MODULE, ++}; ++ ++static int init_flowtable(struct xt_flowoffload_table *tbl) ++{ ++ INIT_DELAYED_WORK(&tbl->work, xt_flowoffload_hook_work); ++ tbl->ft.type = &flowtable_inet; ++ tbl->ft.flags = NF_FLOWTABLE_COUNTER; ++ ++ return nf_flow_table_init(&tbl->ft); ++} ++ ++static int __init xt_flowoffload_tg_init(void) ++{ ++ int ret; ++ ++ register_netdevice_notifier(&flow_offload_netdev_notifier); ++ ++ ret = init_flowtable(&flowtable[0]); ++ if (ret) ++ return ret; ++ ++ ret = init_flowtable(&flowtable[1]); ++ if (ret) ++ goto cleanup; ++ ++ flowtable[1].ft.flags |= NF_FLOWTABLE_HW_OFFLOAD; ++ ++ ret = xt_register_target(&offload_tg_reg); ++ if (ret) ++ goto cleanup2; ++ ++ return 0; ++ ++cleanup2: ++ nf_flow_table_free(&flowtable[1].ft); ++cleanup: ++ nf_flow_table_free(&flowtable[0].ft); ++ return ret; ++} ++ ++static void __exit xt_flowoffload_tg_exit(void) ++{ ++ xt_unregister_target(&offload_tg_reg); ++ unregister_netdevice_notifier(&flow_offload_netdev_notifier); ++ nf_flow_table_free(&flowtable[0].ft); ++ nf_flow_table_free(&flowtable[1].ft); ++} ++ ++MODULE_LICENSE("GPL"); ++module_init(xt_flowoffload_tg_init); ++module_exit(xt_flowoffload_tg_exit); diff --git a/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch new file mode 100644 index 0000000..06f4c22 --- /dev/null +++ 
b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch @@ -0,0 +1,152 @@ +From 772c6e460211ac740b2550fa75be36b8a49731fe Mon Sep 17 00:00:00 2001 +From: "mfreemon@cloudflare.com" +Date: Tue, 1 Mar 2022 17:06:02 -0600 +Subject: [PATCH] tcp: Add a sysctl to skip tcp collapse processing when the + receive buffer is full + +For context and additional information about this patch, see the +blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/ + +sysctl: net.ipv4.tcp_collapse_max_bytes + +If tcp_collapse_max_bytes is non-zero, attempt to collapse the +queue to free up memory if the current amount of memory allocated +is less than tcp_collapse_max_bytes. Otherwise, the packet is +dropped without attempting to collapse the queue. + +If tcp_collapse_max_bytes is zero, this feature is disabled +and the default Linux behavior is used. The default Linux +behavior is to always perform the attempt to collapse the +queue to free up memory. + +When the receive queue is small, we want to collapse the +queue. There are two reasons for this: (a) the latency of +performing the collapse will be small on a small queue, and +(b) we want to avoid sending a congestion signal (via a +packet drop) to the sender when the receive queue is small. + +The result is that we avoid latency spikes caused by the +time it takes to perform the collapse logic when the receive +queue is large and full, while preserving existing behavior +and performance for all other cases. + +Signed-off-by: Alexandre Frade +--- + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 +++++++ + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ + net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ + net/ipv4/tcp_ipv4.c | 1 + + 5 files changed, 52 insertions(+) + +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -223,6 +223,7 @@ struct netns_ipv4 { + + u8 sysctl_fib_notify_on_flag_change; + u8 sysctl_tcp_syn_linear_timeouts; ++ unsigned int sysctl_tcp_collapse_max_bytes; + + #ifdef CONFIG_NET_L3_MASTER_DEV + u8 sysctl_udp_l3mdev_accept; +--- a/include/trace/events/tcp.h ++++ b/include/trace/events/tcp.h +@@ -213,6 +213,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space + TP_ARGS(sk) + ); + ++DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, ++ ++ TP_PROTO(struct sock *sk), ++ ++ TP_ARGS(sk) ++); ++ + TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -1558,6 +1558,13 @@ static struct ctl_table ipv4_net_table[] + .extra2 = SYSCTL_ONE, + }, + { ++ .procname = "tcp_collapse_max_bytes", ++ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ }, ++ { + .procname = "tcp_pingpong_thresh", + .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh, + .maxlen = sizeof(u8), +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5645,6 +5645,7 @@ static bool tcp_prune_ofo_queue(struct s + static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); + +@@ -5656,6 +5657,39 @@ static int tcp_prune_queue(struct sock * + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++ /* For context and additional information about this patch, see the ++ * blog post at 
++ * ++ * sysctl: net.ipv4.tcp_collapse_max_bytes ++ * ++ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the ++ * queue to free up memory if the current amount of memory allocated ++ * is less than tcp_collapse_max_bytes. Otherwise, the packet is ++ * dropped without attempting to collapse the queue. ++ * ++ * If tcp_collapse_max_bytes is zero, this feature is disabled ++ * and the default Linux behavior is used. The default Linux ++ * behavior is to always perform the attempt to collapse the ++ * queue to free up memory. ++ * ++ * When the receive queue is small, we want to collapse the ++ * queue. There are two reasons for this: (a) the latency of ++ * performing the collapse will be small on a small queue, and ++ * (b) we want to avoid sending a congestion signal (via a ++ * packet drop) to the sender when the receive queue is small. ++ * ++ * The result is that we avoid latency spikes caused by the ++ * time it takes to perform the collapse logic when the receive ++ * queue is large and full, while preserving existing behavior ++ * and performance for all other cases. ++ */ ++ if (net->ipv4.sysctl_tcp_collapse_max_bytes && ++ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { ++ /* We are dropping the packet */ ++ trace_tcp_collapse_max_bytes_exceeded(sk); ++ goto do_not_collapse; ++ } ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +@@ -5674,6 +5708,8 @@ static int tcp_prune_queue(struct sock * + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++do_not_collapse: ++ + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -3508,6 +3508,7 @@ static int __net_init tcp_sk_init(struct + + net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; + net->ipv4.sysctl_tcp_shrink_window = 0; ++ net->ipv4.sysctl_tcp_collapse_max_bytes = 0; + + net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); diff --git a/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch new file mode 100644 index 0000000..4581564 --- /dev/null +++ b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch @@ -0,0 +1,191 @@ +From 454d96573eea54fcb7714a4a455d13173dfb9768 Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] PCI: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. 
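As an aside on the tcp_collapse_max_bytes sysctl added by the Cloudflare patch above: it defaults to 0, which keeps the stock always-collapse behaviour, and any non-zero value acts as a byte threshold on sk_rmem_alloc. A hedged sketch of enabling it, the 6 MiB figure being purely illustrative:

    # illustrative threshold; 0 (the default) preserves the old behaviour
    sysctl -w net.ipv4.tcp_collapse_max_bytes=6291456
    # persist across reboots
    echo 'net.ipv4.tcp_collapse_max_bytes = 6291456' > /etc/sysctl.d/99-tcp-collapse.conf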
If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Rebased-by: Alexandre Frade +Signed-off-by: Mark Weiman +Signed-off-by: Alexandre Frade +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 102 ++++++++++++++++++ + 2 files changed, 111 insertions(+) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4414,6 +4414,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. 
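A worked example of the boot option documented above, typically appended to the kernel command line via the bootloader; the 10de:1c82 ID is only a placeholder, and the advice from the commit message applies: use the most limited set of overrides necessary:

    # placeholder device ID; options may be combined, comma-separated
    pcie_acs_override=downstream,multifunction,id:10de:1c82
    # after rebooting, check how the IOMMU groups came out
    find /sys/kernel/iommu_groups/ -type l | sort -V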
+--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3747,6 +3747,106 @@ static void quirk_no_bus_reset(struct pc + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. 
+@@ -5168,6 +5268,8 @@ static const struct pci_dev_acs_enabled + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, ++ /* ACS override */ ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + diff --git a/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch new file mode 100644 index 0000000..a32e0d3 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch @@ -0,0 +1,219 @@ +From e054b70adebd85ed4ab62b458ee6ec3e902d970f Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 27 Feb 2022 14:46:08 -0800 +Subject: [PATCH 1/6] extcon: Add driver for Steam Deck + +(cherry picked from commit f9f2eddae582ae39d5f89c1218448fc259b90aa8) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/extcon/Kconfig | 7 ++ + drivers/extcon/Makefile | 1 + + drivers/extcon/extcon-steamdeck.c | 180 ++++++++++++++++++++++++++++++ + 3 files changed, 188 insertions(+) + create mode 100644 drivers/extcon/extcon-steamdeck.c + +--- a/drivers/extcon/Kconfig ++++ b/drivers/extcon/Kconfig +@@ -203,4 +203,11 @@ config EXTCON_RTK_TYPE_C + The DHC (Digital Home Hub) RTD series SoC contains a type c module. + This driver will detect the status of the type-c port. + ++config EXTCON_STEAMDECK ++ tristate "Steam Deck extcon support" ++ depends on MFD_STEAMDECK ++ help ++ Say Y here to enable support of USB Type C cable detection extcon ++ support on Steam Deck devices ++ + endif +--- a/drivers/extcon/Makefile ++++ b/drivers/extcon/Makefile +@@ -26,3 +26,4 @@ obj-$(CONFIG_EXTCON_USB_GPIO) += extcon- + obj-$(CONFIG_EXTCON_USBC_CROS_EC) += extcon-usbc-cros-ec.o + obj-$(CONFIG_EXTCON_USBC_TUSB320) += extcon-usbc-tusb320.o + obj-$(CONFIG_EXTCON_RTK_TYPE_C) += extcon-rtk-type-c.o ++obj-$(CONFIG_EXTCON_STEAMDECK) += extcon-steamdeck.o +--- /dev/null ++++ b/drivers/extcon/extcon-steamdeck.c +@@ -0,0 +1,180 @@ ++ ++#include ++#include ++#include ++ ++#define ACPI_STEAMDECK_NOTIFY_STATUS 0x80 ++ ++/* 0 - port connected, 1 -port disconnected */ ++#define ACPI_STEAMDECK_PORT_CONNECT BIT(0) ++/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */ ++#define ACPI_STEAMDECK_CUR_DATA_ROLE BIT(3) ++/* ++ * Debouncing delay to allow negotiation process to settle. 2s value ++ * was arrived at via trial and error. 
++ */ ++#define STEAMDECK_ROLE_SWITCH_DELAY (msecs_to_jiffies(2000)) ++ ++struct steamdeck_extcon { ++ struct acpi_device *adev; ++ struct delayed_work role_work; ++ struct extcon_dev *edev; ++ struct device *dev; ++}; ++ ++static int steamdeck_read_pdcs(struct steamdeck_extcon *sd, unsigned long long *pdcs) ++{ ++ acpi_status status; ++ ++ status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs); ++ if (ACPI_FAILURE(status)) { ++ dev_err(sd->dev, "PDCS evaluation failed: %s\n", ++ acpi_format_exception(status)); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++static void steamdeck_usb_role_work(struct work_struct *work) ++{ ++ struct steamdeck_extcon *sd = ++ container_of(work, struct steamdeck_extcon, role_work.work); ++ unsigned long long pdcs; ++ bool usb_host; ++ ++ if (steamdeck_read_pdcs(sd, &pdcs)) ++ return; ++ ++ /* ++ * We only care about these two ++ */ ++ pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE; ++ ++ /* ++ * For "connect" events our role is determined by a bit in ++ * PDCS, for "disconnect" we switch to being a gadget ++ * unconditionally. The thinking for the latter is we don't ++ * want to start acting as a USB host until we get ++ * confirmation from the firmware that we are a USB host ++ */ ++ usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? ++ pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false; ++ ++ dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device"); ++ WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST, ++ usb_host)); ++ ++} ++ ++static void steamdeck_notify(acpi_handle handle, u32 event, void *context) ++{ ++ struct device *dev = context; ++ struct steamdeck_extcon *sd = dev_get_drvdata(dev); ++ unsigned long long pdcs; ++ unsigned long delay; ++ ++ switch (event) { ++ case ACPI_STEAMDECK_NOTIFY_STATUS: ++ if (steamdeck_read_pdcs(sd, &pdcs)) ++ return; ++ /* ++ * We process "disconnect" events immediately and ++ * "connect" events with a delay to give the HW time ++ * to settle. For example attaching USB hub (at least ++ * for HW used for testing) will generate intermediary ++ * event with "host" bit not set, followed by the one ++ * that does have it set. ++ */ ++ delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? 
++ STEAMDECK_ROLE_SWITCH_DELAY : 0; ++ ++ queue_delayed_work(system_long_wq, &sd->role_work, delay); ++ break; ++ default: ++ dev_warn(dev, "Unsupported event [0x%x]\n", event); ++ } ++} ++ ++static void steamdeck_remove_notify_handler(void *data) ++{ ++ struct steamdeck_extcon *sd = data; ++ ++ acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY, ++ steamdeck_notify); ++ cancel_delayed_work_sync(&sd->role_work); ++} ++ ++static const unsigned int steamdeck_extcon_cable[] = { ++ EXTCON_USB, ++ EXTCON_USB_HOST, ++ EXTCON_CHG_USB_SDP, ++ EXTCON_CHG_USB_CDP, ++ EXTCON_CHG_USB_DCP, ++ EXTCON_CHG_USB_ACA, ++ EXTCON_NONE, ++}; ++ ++static int steamdeck_extcon_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_extcon *sd; ++ acpi_status status; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work); ++ platform_set_drvdata(pdev, sd); ++ sd->adev = ACPI_COMPANION(dev->parent); ++ sd->dev = dev; ++ sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable); ++ if (IS_ERR(sd->edev)) ++ return PTR_ERR(sd->edev); ++ ++ ret = devm_extcon_dev_register(dev, sd->edev); ++ if (ret < 0) { ++ dev_err(dev, "Failed to register extcon device: %d\n", ret); ++ return ret; ++ } ++ ++ /* ++ * Set initial role value ++ */ ++ queue_delayed_work(system_long_wq, &sd->role_work, 0); ++ flush_delayed_work(&sd->role_work); ++ ++ status = acpi_install_notify_handler(sd->adev->handle, ++ ACPI_DEVICE_NOTIFY, ++ steamdeck_notify, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "Error installing ACPI notify handler\n"); ++ return -EIO; ++ } ++ ++ ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler, ++ sd); ++ return ret; ++} ++ ++static const struct platform_device_id steamdeck_extcon_id_table[] = { ++ { .name = "steamdeck-extcon" }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_extcon_id_table); ++ ++static struct platform_driver steamdeck_extcon_driver = { ++ .probe = steamdeck_extcon_probe, ++ .driver = { ++ .name = "steamdeck-extcon", ++ }, ++ .id_table = steamdeck_extcon_id_table, ++}; ++module_platform_driver(steamdeck_extcon_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck extcon driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch new file mode 100644 index 0000000..ec80cc4 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch @@ -0,0 +1,274 @@ +From be312a466f817a38a48668f93bb07a75ff9fa875 Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 19 Feb 2022 16:09:45 -0800 +Subject: [PATCH 2/6] hwmon: Add driver for Steam Deck's EC sensors + +Add driver for sensors exposed by EC firmware on Steam Deck hardware. + +(cherry picked from commit 6917aac77bee6185ae3920b936cdbe7876118c0b) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/hwmon/Kconfig | 11 ++ + drivers/hwmon/Makefile | 1 + + drivers/hwmon/steamdeck-hwmon.c | 224 ++++++++++++++++++++++++++++++++ + 3 files changed, 236 insertions(+) + create mode 100644 drivers/hwmon/steamdeck-hwmon.c + +--- a/drivers/hwmon/Kconfig ++++ b/drivers/hwmon/Kconfig +@@ -2050,6 +2050,17 @@ config SENSORS_SCH5636 + This driver can also be built as a module. 
If so, the module + will be called sch5636. + ++config SENSORS_STEAMDECK ++ tristate "Steam Deck EC sensors" ++ depends on MFD_STEAMDECK ++ help ++ If you say yes here you get support for the hardware ++ monitoring features exposed by EC firmware on Steam Deck ++ devices ++ ++ This driver can also be built as a module. If so, the module ++ will be called steamdeck-hwmon. ++ + config SENSORS_STTS751 + tristate "ST Microelectronics STTS751" + depends on I2C +--- a/drivers/hwmon/Makefile ++++ b/drivers/hwmon/Makefile +@@ -207,6 +207,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47 + obj-$(CONFIG_SENSORS_SMSC47M192)+= smsc47m192.o + obj-$(CONFIG_SENSORS_SPARX5) += sparx5-temp.o + obj-$(CONFIG_SENSORS_SPD5118) += spd5118.o ++obj-$(CONFIG_SENSORS_STEAMDECK) += steamdeck-hwmon.o + obj-$(CONFIG_SENSORS_STTS751) += stts751.o + obj-$(CONFIG_SENSORS_SURFACE_FAN)+= surface_fan.o + obj-$(CONFIG_SENSORS_SY7636A) += sy7636a-hwmon.o +--- /dev/null ++++ b/drivers/hwmon/steamdeck-hwmon.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Steam Deck EC sensors driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ */ ++ ++#include ++#include ++#include ++ ++#define STEAMDECK_HWMON_NAME "steamdeck-hwmon" ++ ++struct steamdeck_hwmon { ++ struct acpi_device *adev; ++}; ++ ++static long ++steamdeck_hwmon_get(struct steamdeck_hwmon *sd, const char *method) ++{ ++ unsigned long long val; ++ if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, ++ (char *)method, NULL, &val))) ++ return -EIO; ++ ++ return val; ++} ++ ++static int ++steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, long *out) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ ++ switch (type) { ++ case hwmon_curr: ++ if (attr != hwmon_curr_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "PDAM"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_in: ++ if (attr != hwmon_in_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "PDVL"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_temp: ++ if (attr != hwmon_temp_input) ++ return -EOPNOTSUPP; ++ ++ *out = steamdeck_hwmon_get(sd, "BATT"); ++ if (*out < 0) ++ return *out; ++ /* ++ * Assuming BATT returns deg C we need to mutiply it ++ * by 1000 to convert to mC ++ */ ++ *out *= 1000; ++ break; ++ case hwmon_fan: ++ switch (attr) { ++ case hwmon_fan_input: ++ *out = steamdeck_hwmon_get(sd, "FANR"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_fan_target: ++ *out = steamdeck_hwmon_get(sd, "FSSR"); ++ if (*out < 0) ++ return *out; ++ break; ++ case hwmon_fan_fault: ++ *out = steamdeck_hwmon_get(sd, "FANC"); ++ if (*out < 0) ++ return *out; ++ /* ++ * FANC (Fan check): ++ * 0: Abnormal ++ * 1: Normal ++ */ ++ *out = !*out; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static int ++steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, const char **str) ++{ ++ switch (type) { ++ /* ++ * These two aren't, strictly speaking, measured. EC ++ * firmware just reports what PD negotiation resulted ++ * in. 
++ */ ++ case hwmon_curr: ++ *str = "PD Contract Current"; ++ break; ++ case hwmon_in: ++ *str = "PD Contract Voltage"; ++ break; ++ case hwmon_temp: ++ *str = "Battery Temp"; ++ break; ++ case hwmon_fan: ++ *str = "System Fan"; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static int ++steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type, ++ u32 attr, int channel, long val) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ ++ if (type != hwmon_fan || ++ attr != hwmon_fan_target) ++ return -EOPNOTSUPP; ++ ++ val = clamp_val(val, 0, 7300); ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "FANS", val))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static umode_t ++steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, ++ u32 attr, int channel) ++{ ++ if (type == hwmon_fan && ++ attr == hwmon_fan_target) ++ return 0644; ++ ++ return 0444; ++} ++ ++static const struct hwmon_channel_info *steamdeck_hwmon_info[] = { ++ HWMON_CHANNEL_INFO(in, ++ HWMON_I_INPUT | HWMON_I_LABEL), ++ HWMON_CHANNEL_INFO(curr, ++ HWMON_C_INPUT | HWMON_C_LABEL), ++ HWMON_CHANNEL_INFO(temp, ++ HWMON_T_INPUT | HWMON_T_LABEL), ++ HWMON_CHANNEL_INFO(fan, ++ HWMON_F_INPUT | HWMON_F_LABEL | ++ HWMON_F_TARGET | HWMON_F_FAULT), ++ NULL ++}; ++ ++static const struct hwmon_ops steamdeck_hwmon_ops = { ++ .is_visible = steamdeck_hwmon_is_visible, ++ .read = steamdeck_hwmon_read, ++ .read_string = steamdeck_hwmon_read_string, ++ .write = steamdeck_hwmon_write, ++}; ++ ++static const struct hwmon_chip_info steamdeck_hwmon_chip_info = { ++ .ops = &steamdeck_hwmon_ops, ++ .info = steamdeck_hwmon_info, ++}; ++ ++static int steamdeck_hwmon_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_hwmon *sd; ++ struct device *hwmon; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ sd->adev = ACPI_COMPANION(dev->parent); ++ hwmon = devm_hwmon_device_register_with_info(dev, ++ "steamdeck_hwmon", ++ sd, ++ &steamdeck_hwmon_chip_info, ++ NULL); ++ if (IS_ERR(hwmon)) { ++ dev_err(dev, "Failed to register HWMON device"); ++ return PTR_ERR(hwmon); ++ } ++ ++ return 0; ++} ++ ++static const struct platform_device_id steamdeck_hwmon_id_table[] = { ++ { .name = STEAMDECK_HWMON_NAME }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_hwmon_id_table); ++ ++static struct platform_driver steamdeck_hwmon_driver = { ++ .probe = steamdeck_hwmon_probe, ++ .driver = { ++ .name = STEAMDECK_HWMON_NAME, ++ }, ++ .id_table = steamdeck_hwmon_id_table, ++}; ++module_platform_driver(steamdeck_hwmon_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck EC sensors driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch new file mode 100644 index 0000000..874d5b9 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch @@ -0,0 +1,104 @@ +From efe7b80f8a7448265d238c13ed1b6e327a9c739e Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 15 Jul 2023 12:58:54 -0700 +Subject: [PATCH 3/6] hwmon: steamdeck-hwmon: Add support for max battery + level/rate + +Add support for max battery level/charge rate attributes. 
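A short sketch of driving the two new attributes from userspace; the hwmon index is an assumption, so it is located by name first, and both values are 0-100 with the exact semantics defined by the EC firmware:

    # find the right hwmon instance (index varies with probe order)
    grep -l steamdeck_hwmon /sys/class/hwmon/hwmon*/name
    # assuming it turned out to be hwmon5:
    cat /sys/class/hwmon/hwmon5/max_battery_charge_level
    echo 80 > /sys/class/hwmon/hwmon5/max_battery_charge_level
    echo 50 > /sys/class/hwmon/hwmon5/max_battery_charge_rate

Values of 101 and above are rejected with -EINVAL by the store helper added below.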
+ +Signed-off-by: Andrey Smirnov +(cherry picked from commit 50af83e8fd75dc52221edd3fb6fd7a7f70c4d8a4) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/hwmon/steamdeck-hwmon.c | 72 ++++++++++++++++++++++++++++++++- + 1 file changed, 71 insertions(+), 1 deletion(-) + +--- a/drivers/hwmon/steamdeck-hwmon.c ++++ b/drivers/hwmon/steamdeck-hwmon.c +@@ -180,6 +180,76 @@ static const struct hwmon_chip_info stea + .info = steamdeck_hwmon_info, + }; + ++ ++static ssize_t ++steamdeck_hwmon_simple_store(struct device *dev, const char *buf, size_t count, ++ const char *method, ++ unsigned long upper_limit) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ unsigned long value; ++ ++ if (kstrtoul(buf, 10, &value) || value >= upper_limit) ++ return -EINVAL; ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ (char *)method, value))) ++ return -EIO; ++ ++ return count; ++} ++ ++static ssize_t ++steamdeck_hwmon_simple_show(struct device *dev, char *buf, ++ const char *method) ++{ ++ struct steamdeck_hwmon *sd = dev_get_drvdata(dev); ++ unsigned long value; ++ ++ value = steamdeck_hwmon_get(sd, method); ++ if (value < 0) ++ return value; ++ ++ return sprintf(buf, "%ld\n", value); ++} ++ ++#define STEAMDECK_HWMON_ATTR_RW(_name, _set_method, _get_method, \ ++ _upper_limit) \ ++ static ssize_t _name##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ return steamdeck_hwmon_simple_show(dev, buf, \ ++ _get_method); \ ++ } \ ++ static ssize_t _name##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ return steamdeck_hwmon_simple_store(dev, buf, count, \ ++ _set_method, \ ++ _upper_limit); \ ++ } \ ++ static DEVICE_ATTR_RW(_name) ++ ++STEAMDECK_HWMON_ATTR_RW(max_battery_charge_level, "FCBL", "SFBL", 101); ++STEAMDECK_HWMON_ATTR_RW(max_battery_charge_rate, "CHGR", "GCHR", 101); ++ ++static struct attribute *steamdeck_hwmon_attributes[] = { ++ &dev_attr_max_battery_charge_level.attr, ++ &dev_attr_max_battery_charge_rate.attr, ++ NULL ++}; ++ ++static const struct attribute_group steamdeck_hwmon_group = { ++ .attrs = steamdeck_hwmon_attributes, ++}; ++ ++static const struct attribute_group *steamdeck_hwmon_groups[] = { ++ &steamdeck_hwmon_group, ++ NULL ++}; ++ + static int steamdeck_hwmon_probe(struct platform_device *pdev) + { + struct device *dev = &pdev->dev; +@@ -195,7 +265,7 @@ static int steamdeck_hwmon_probe(struct + "steamdeck_hwmon", + sd, + &steamdeck_hwmon_chip_info, +- NULL); ++ steamdeck_hwmon_groups); + if (IS_ERR(hwmon)) { + dev_err(dev, "Failed to register HWMON device"); + return PTR_ERR(hwmon); diff --git a/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch new file mode 100644 index 0000000..98c746a --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch @@ -0,0 +1,118 @@ +From e76032abd59b7d2b15c8106630423e3091b298ce Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 27 Feb 2022 12:58:05 -0800 +Subject: [PATCH 4/6] leds: steamdeck: Add support for Steam Deck LED + +(cherry picked from commit 85a86d19aa7022ff0555023d53aef78323a42d0c) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/leds/Kconfig | 7 ++++ + drivers/leds/Makefile | 1 + + drivers/leds/leds-steamdeck.c | 74 +++++++++++++++++++++++++++++++++++ 
+ 3 files changed, 82 insertions(+) + create mode 100644 drivers/leds/leds-steamdeck.c + +--- a/drivers/leds/Kconfig ++++ b/drivers/leds/Kconfig +@@ -951,6 +951,13 @@ config LEDS_ACER_A500 + This option enables support for the Power Button LED of + Acer Iconia Tab A500. + ++config LEDS_STEAMDECK ++ tristate "LED support for Steam Deck" ++ depends on LEDS_CLASS && MFD_STEAMDECK ++ help ++ This option enabled support for the status LED (next to the ++ power button) on Steam Deck ++ + source "drivers/leds/blink/Kconfig" + + comment "Flash and Torch LED drivers" +--- a/drivers/leds/Makefile ++++ b/drivers/leds/Makefile +@@ -81,6 +81,7 @@ obj-$(CONFIG_LEDS_POWERNV) += leds-powe + obj-$(CONFIG_LEDS_PWM) += leds-pwm.o + obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o + obj-$(CONFIG_LEDS_SC27XX_BLTC) += leds-sc27xx-bltc.o ++obj-$(CONFIG_LEDS_STEAMDECK) += leds-steamdeck.o + obj-$(CONFIG_LEDS_SUN50I_A100) += leds-sun50i-a100.o + obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o + obj-$(CONFIG_LEDS_SYSCON) += leds-syscon.o +--- /dev/null ++++ b/drivers/leds/leds-steamdeck.c +@@ -0,0 +1,74 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++ ++/* ++ * Steam Deck EC MFD LED cell driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ * ++ */ ++ ++#include ++#include ++#include ++ ++struct steamdeck_led { ++ struct acpi_device *adev; ++ struct led_classdev cdev; ++}; ++ ++static int steamdeck_leds_brightness_set(struct led_classdev *cdev, ++ enum led_brightness value) ++{ ++ struct steamdeck_led *sd = container_of(cdev, struct steamdeck_led, ++ cdev); ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "CHBV", value))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static int steamdeck_leds_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct steamdeck_led *sd; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ ++ sd->adev = ACPI_COMPANION(dev->parent); ++ ++ sd->cdev.name = "status:white"; ++ sd->cdev.brightness_set_blocking = steamdeck_leds_brightness_set; ++ sd->cdev.max_brightness = 100; ++ ++ ret = devm_led_classdev_register(dev, &sd->cdev); ++ if (ret) { ++ dev_err(dev, "Failed to register LEDs device: %d\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct platform_device_id steamdeck_leds_id_table[] = { ++ { .name = "steamdeck-leds" }, ++ {} ++}; ++MODULE_DEVICE_TABLE(platform, steamdeck_leds_id_table); ++ ++static struct platform_driver steamdeck_leds_driver = { ++ .probe = steamdeck_leds_probe, ++ .driver = { ++ .name = "steamdeck-leds", ++ }, ++ .id_table = steamdeck_leds_id_table, ++}; ++module_platform_driver(steamdeck_leds_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck LEDs driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch new file mode 100644 index 0000000..8227294 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch @@ -0,0 +1,176 @@ +From c3cf089a9da1216d3e1e047eba2ec46e0febe457 Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sat, 19 Feb 2022 16:08:36 -0800 +Subject: [PATCH 5/6] mfd: Add MFD core driver for Steam Deck + +Add MFD core driver for Steam Deck. Doesn't really do much so far +besides instantiating a number of MFD cells that implement all the +interesting functionality. 
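Once this core driver binds to the ACPI device, the cells listed further down become child platform devices, and the root device itself grows a pair of read-only attributes (firmware_version, board_id). A sketch of inspecting them, with the sysfs path derived from the VLV0100 ACPI ID and therefore an assumption:

    # path assumed from the ACPI ID; adjust if the instance suffix differs
    cat /sys/bus/platform/devices/VLV0100:00/firmware_version
    cat /sys/bus/platform/devices/VLV0100:00/board_id
    # the hwmon, leds and extcon cells appear as their own platform devices
    ls /sys/bus/platform/devices/ | grep steamdeck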
+ +(cherry picked from commit 5f534c2d6ebdefccb9c024eb0f013bc1c0c622d9) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/mfd/Kconfig | 11 ++++ + drivers/mfd/Makefile | 2 + + drivers/mfd/steamdeck.c | 127 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 140 insertions(+) + create mode 100644 drivers/mfd/steamdeck.c + +--- a/drivers/mfd/Kconfig ++++ b/drivers/mfd/Kconfig +@@ -2390,5 +2390,16 @@ config MFD_RSMU_SPI + Additional drivers must be enabled in order to use the functionality + of the device. + ++config MFD_STEAMDECK ++ tristate "Valve Steam Deck" ++ select MFD_CORE ++ depends on ACPI ++ depends on X86_64 || COMPILE_TEST ++ help ++ This driver registers various MFD cells that expose aspects ++ of Steam Deck specific ACPI functionality. ++ ++ Say N here, unless you are running on Steam Deck hardware. ++ + endmenu + endif +--- a/drivers/mfd/Makefile ++++ b/drivers/mfd/Makefile +@@ -288,3 +288,5 @@ obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x + + obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o + obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o ++ ++obj-$(CONFIG_MFD_STEAMDECK) += steamdeck.o +--- /dev/null ++++ b/drivers/mfd/steamdeck.c +@@ -0,0 +1,127 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++ ++/* ++ * Steam Deck EC MFD core driver ++ * ++ * Copyright (C) 2021-2022 Valve Corporation ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#define STEAMDECK_STA_OK \ ++ (ACPI_STA_DEVICE_ENABLED | \ ++ ACPI_STA_DEVICE_PRESENT | \ ++ ACPI_STA_DEVICE_FUNCTIONING) ++ ++struct steamdeck { ++ struct acpi_device *adev; ++ struct device *dev; ++}; ++ ++#define STEAMDECK_ATTR_RO(_name, _method) \ ++ static ssize_t _name##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct steamdeck *sd = dev_get_drvdata(dev); \ ++ unsigned long long val; \ ++ \ ++ if (ACPI_FAILURE(acpi_evaluate_integer( \ ++ sd->adev->handle, \ ++ _method, NULL, &val))) \ ++ return -EIO; \ ++ \ ++ return sysfs_emit(buf, "%llu\n", val); \ ++ } \ ++ static DEVICE_ATTR_RO(_name) ++ ++STEAMDECK_ATTR_RO(firmware_version, "PDFW"); ++STEAMDECK_ATTR_RO(board_id, "BOID"); ++ ++static struct attribute *steamdeck_attrs[] = { ++ &dev_attr_firmware_version.attr, ++ &dev_attr_board_id.attr, ++ NULL ++}; ++ ++ATTRIBUTE_GROUPS(steamdeck); ++ ++static const struct mfd_cell steamdeck_cells[] = { ++ { .name = "steamdeck-hwmon" }, ++ { .name = "steamdeck-leds" }, ++ { .name = "steamdeck-extcon" }, ++}; ++ ++static void steamdeck_remove_sysfs_groups(void *data) ++{ ++ struct steamdeck *sd = data; ++ ++ sysfs_remove_groups(&sd->dev->kobj, steamdeck_groups); ++} ++ ++static int steamdeck_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ unsigned long long sta; ++ struct steamdeck *sd; ++ acpi_status status; ++ int ret; ++ ++ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); ++ if (!sd) ++ return -ENOMEM; ++ sd->adev = ACPI_COMPANION(dev); ++ sd->dev = dev; ++ platform_set_drvdata(pdev, sd); ++ ++ status = acpi_evaluate_integer(sd->adev->handle, "_STA", ++ NULL, &sta); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "Status check failed (0x%x)\n", status); ++ return -EINVAL; ++ } ++ ++ if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) { ++ dev_err(dev, "Device is not ready\n"); ++ return -EINVAL; ++ } ++ ++ ret = sysfs_create_groups(&dev->kobj, steamdeck_groups); ++ if (ret) { ++ dev_err(dev, "Failed to create sysfs group\n"); ++ return ret; ++ } ++ ++ ret = devm_add_action_or_reset(dev, steamdeck_remove_sysfs_groups, ++ sd); ++ if (ret) 
{ ++ dev_err(dev, "Failed to register devres action\n"); ++ return ret; ++ } ++ ++ return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, ++ steamdeck_cells, ARRAY_SIZE(steamdeck_cells), ++ NULL, 0, NULL); ++} ++ ++static const struct acpi_device_id steamdeck_device_ids[] = { ++ { "VLV0100", 0 }, ++ { "", 0 }, ++}; ++MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids); ++ ++static struct platform_driver steamdeck_driver = { ++ .probe = steamdeck_probe, ++ .driver = { ++ .name = "steamdeck", ++ .acpi_match_table = steamdeck_device_ids, ++ }, ++}; ++module_platform_driver(steamdeck_driver); ++ ++MODULE_AUTHOR("Andrey Smirnov "); ++MODULE_DESCRIPTION("Steam Deck EC MFD core driver"); ++MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch new file mode 100644 index 0000000..7153da1 --- /dev/null +++ b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch @@ -0,0 +1,49 @@ +From c24abd65482a08109e756c612e83c8c8450a263b Mon Sep 17 00:00:00 2001 +From: Andrey Smirnov +Date: Sun, 24 Sep 2023 15:02:33 -0700 +Subject: [PATCH 6/6] mfd: steamdeck: Expose controller board power in sysfs + +As of version 118 Deck's BIOS implements "SCBP" method that allows +gating power of the controller board (VBUS). Add a basic WO method to +our root MFD device to allow toggling that. + +Signed-off-by: Andrey Smirnov +(cherry picked from commit f97f32718acc10cbb51fef925842392e80904d74) +Signed-off-by: Cristian Ciocaltea +Signed-off-by: Alexandre Frade +--- + drivers/mfd/steamdeck.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/drivers/mfd/steamdeck.c ++++ b/drivers/mfd/steamdeck.c +@@ -41,9 +41,29 @@ struct steamdeck { + STEAMDECK_ATTR_RO(firmware_version, "PDFW"); + STEAMDECK_ATTR_RO(board_id, "BOID"); + ++static ssize_t controller_board_power_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct steamdeck *sd = dev_get_drvdata(dev); ++ bool enabled; ++ ssize_t ret = kstrtobool(buf, &enabled); ++ ++ if (ret) ++ return ret; ++ ++ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, ++ "SCBP", enabled))) ++ return -EIO; ++ ++ return count; ++} ++static DEVICE_ATTR_WO(controller_board_power); ++ + static struct attribute *steamdeck_attrs[] = { + &dev_attr_firmware_version.attr, + &dev_attr_board_id.attr, ++ &dev_attr_controller_board_power.attr, + NULL + }; + diff --git a/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch b/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch new file mode 100644 index 0000000..967c233 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch @@ -0,0 +1,25 @@ +From 2a949c896ea66e647d94254c28b1d4d6194ee14b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 00:55:35 +0000 +Subject: [PATCH 03/19] XANMOD: kbuild: Add GCC SMS-based modulo scheduling + flags + +Signed-off-by: Alexandre Frade +--- + Makefile | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/Makefile ++++ b/Makefile +@@ -822,6 +822,11 @@ KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s + endif + ++# Perform swing modulo scheduling immediately before the first scheduling pass. 
++# This pass looks at innermost loops and reorders their instructions by ++# overlapping different iterations. ++KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves) ++ + # Always set `debug-assertions` and `overflow-checks` because their default + # depends on `opt-level` and `debug-assertions`, respectively. + KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n) diff --git a/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch new file mode 100644 index 0000000..4bf2185 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch @@ -0,0 +1,76 @@ +From 867a311f231e89b4c9194579e40718f751761ffc Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sat, 31 Aug 2024 16:57:41 +0000 +Subject: [PATCH 04/19] kbuild: Remove GCC minimal function alignment + +Signed-off-by: Alexandre Frade +--- + Makefile | 7 ------- + arch/Kconfig | 12 ------------ + include/linux/compiler_types.h | 10 +++++----- + 3 files changed, 5 insertions(+), 24 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -981,15 +981,8 @@ export CC_FLAGS_FPU + export CC_FLAGS_NO_FPU + + ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) +-# Set the minimal function alignment. Use the newer GCC option +-# -fmin-function-alignment if it is available, or fall back to -falign-funtions. +-# See also CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT. +-ifdef CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT +-KBUILD_CFLAGS += -fmin-function-alignment=$(CONFIG_FUNCTION_ALIGNMENT) +-else + KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) + endif +-endif + + # arch Makefile may override CC so keep this after arch Makefile is included + NOSTDINC_FLAGS += -nostdinc +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1628,18 +1628,6 @@ config FUNCTION_ALIGNMENT + default 4 if FUNCTION_ALIGNMENT_4B + default 0 + +-config CC_HAS_MIN_FUNCTION_ALIGNMENT +- # Detect availability of the GCC option -fmin-function-alignment which +- # guarantees minimal alignment for all functions, unlike +- # -falign-functions which the compiler ignores for cold functions. +- def_bool $(cc-option, -fmin-function-alignment=8) +- +-config CC_HAS_SANE_FUNCTION_ALIGNMENT +- # Set if the guaranteed alignment with -fmin-function-alignment is +- # available or extra care is required in the kernel. Clang provides +- # strict alignment always, even with -falign-functions. +- def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG +- + config ARCH_NEED_CMPXCHG_1_EMU + bool + +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -99,17 +99,17 @@ static inline void __chk_io_ptr(const vo + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute + * + * When -falign-functions=N is in use, we must avoid the cold attribute as +- * GCC drops the alignment for cold functions. Worse, GCC can implicitly mark +- * callees of cold functions as cold themselves, so it's not sufficient to add +- * __function_aligned here as that will not ensure that callees are correctly +- * aligned. ++ * contemporary versions of GCC drop the alignment for cold functions. Worse, ++ * GCC can implicitly mark callees of cold functions as cold themselves, so ++ * it's not sufficient to add __function_aligned here as that will not ensure ++ * that callees are correctly aligned. 
+ * + * See: + * + * https://lore.kernel.org/lkml/Y77%2FqVgvaJidFpYt@FVFF77S0Q05N + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c9 + */ +-#if defined(CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT) || (CONFIG_FUNCTION_ALIGNMENT == 0) ++#if !defined(CONFIG_CC_IS_GCC) || (CONFIG_FUNCTION_ALIGNMENT == 0) + #define __cold __attribute__((__cold__)) + #else + #define __cold diff --git a/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch new file mode 100644 index 0000000..bcbb768 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch @@ -0,0 +1,22 @@ +From e98005253085b5a00881c085fab388a4742e700b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 11 May 2023 19:41:41 +0000 +Subject: [PATCH 05/19] XANMOD: fair: Set scheduler tunable latencies to + unscaled + +Signed-off-by: Alexandre Frade +--- + kernel/sched/fair.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -66,7 +66,7 @@ + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ +-unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + + /* + * Minimal preemption granularity for CPU-bound tasks: diff --git a/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch new file mode 100644 index 0000000..2f6063e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch @@ -0,0 +1,71 @@ +From 07341955b471371f5414f94814c49289de9319cf Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sun, 15 Sep 2024 23:03:38 +0000 +Subject: [PATCH 06/19] XANMOD: sched: Add yield_type sysctl to reduce or + disable sched_yield + +Signed-off-by: Alexandre Frade +--- + kernel/sched/syscalls.c | 16 +++++++++++++++- + kernel/sysctl.c | 10 ++++++++++ + 2 files changed, 25 insertions(+), 1 deletion(-) + +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -1457,15 +1457,29 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t + return ret; + } + ++/** ++ * sysctl_sched_yield_type - choose the yield level that will perform. ++ * ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. ++ * 2: Re-queue current tasks. 
(default CFS) ++ */ ++__read_mostly int sysctl_sched_yield_type = 2; ++ + static void do_sched_yield(void) + { + struct rq_flags rf; + struct rq *rq; + ++ if (!sysctl_sched_yield_type) ++ return; ++ + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); +- current->sched_class->yield_task(rq); ++ ++ if (sysctl_sched_yield_type > 1) ++ current->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -97,6 +97,7 @@ static const int six_hundred_forty_kb = + #endif + + ++extern int sysctl_sched_yield_type; + static const int ngroups_max = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + +@@ -1631,6 +1632,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++ { ++ .procname = "yield_type", ++ .data = &sysctl_sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", diff --git a/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch new file mode 100644 index 0000000..5c5c68e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch @@ -0,0 +1,39 @@ +From 9d6ba825d8c8635e217f7596fb4401c9ef9d408a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 11 May 2022 18:56:51 +0000 +Subject: [PATCH 07/19] XANMOD: block/mq-deadline: Increase write priority to + improve responsiveness + +Signed-off-by: Alexandre Frade +--- + block/mq-deadline.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -4,6 +4,9 @@ + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe ++ * ++ * Tunes for responsiveness by Alexandre Frade ++ * (C) 2022 Alexandre Frade + */ + #include + #include +@@ -28,13 +31,13 @@ + * See Documentation/block/deadline-iosched.rst + */ + static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +-static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ ++static const int write_expire = HZ; /* ditto for writes, these limits are SOFT! */ + /* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ + static const int prio_aging_expire = 10 * HZ; +-static const int writes_starved = 2; /* max times reads can starve a write */ ++static const int writes_starved = 1; /* max times reads can starve a write */ + static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. 
*/ + diff --git a/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch new file mode 100644 index 0000000..00b1d78 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch @@ -0,0 +1,22 @@ +From f4b45e0e5444254caf5992cde662236867ac388b Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Thu, 6 Jan 2022 16:59:01 +0000 +Subject: [PATCH 08/19] XANMOD: block/mq-deadline: Disable front_merges by + default + +Signed-off-by: Alexandre Frade +--- + block/mq-deadline.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -600,7 +600,7 @@ static int dd_init_sched(struct request_ + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; + dd->writes_starved = writes_starved; +- dd->front_merges = 1; ++ dd->front_merges = 0; + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; diff --git a/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch new file mode 100644 index 0000000..63f64f6 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch @@ -0,0 +1,23 @@ +From 08580ad4d8ffb10cb30a228361d2d914a1502e3c Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 15:36:01 +0000 +Subject: [PATCH 09/19] XANMOD: block: Set rq_affinity to force complete I/O + requests on same CPU + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -605,7 +605,8 @@ enum { + QUEUE_FLAG_MAX + }; + +-#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) ++#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ ++ (1UL << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); diff --git a/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch new file mode 100644 index 0000000..10bf8f8 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch @@ -0,0 +1,30 @@ +From 278088038e8259faf193e23ac16b48b0350da6fb Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 15 Jul 2024 04:50:34 +0000 +Subject: [PATCH 10/19] XANMOD: blk-wbt: Set wbt_default_latency_nsec() to + 2msec + +Signed-off-by: Alexandre Frade +--- + block/blk-wbt.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +--- a/block/blk-wbt.c ++++ b/block/blk-wbt.c +@@ -730,14 +730,8 @@ EXPORT_SYMBOL_GPL(wbt_enable_default); + + u64 wbt_default_latency_nsec(struct request_queue *q) + { +- /* +- * We default to 2msec for non-rotational storage, and 75msec +- * for rotational storage. 
+- */ +- if (blk_queue_nonrot(q)) +- return 2000000ULL; +- else +- return 75000000ULL; ++ /* XanMod defaults to 2msec for any type of storage */ ++ return 2000000ULL; + } + + static int wbt_data_dir(const struct request *rq) diff --git a/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch new file mode 100644 index 0000000..1b9156c --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch @@ -0,0 +1,35 @@ +From 1637fbf551572f4578ed5644ae485b8da659b509 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 17:26:15 +0000 +Subject: [PATCH 11/19] XANMOD: kconfig: add 500Hz timer interrupt kernel + config option + +Signed-off-by: Alexandre Frade +--- + kernel/Kconfig.hz | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +60,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK diff --git a/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch new file mode 100644 index 0000000..0cee4de --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch @@ -0,0 +1,22 @@ +From 53a8dc8afb62674acc347ac7d9ebb46338ab7d64 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 16:59:22 +0000 +Subject: [PATCH 12/19] XANMOD: dcache: cache_pressure = 50 decreases the rate + at which VFS caches are reclaimed + +Signed-off-by: Alexandre Frade +--- + fs/dcache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -73,7 +73,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); diff --git a/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch new file mode 100644 index 0000000..354cd74 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch @@ -0,0 +1,21 @@ +From 73df371934a8b28f57030858965f6869c3705836 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 14 Aug 2024 18:54:53 +0000 +Subject: [PATCH 14/19] XANMOD: mm/vmscan: Set minimum amount of swapping + +Signed-off-by: Alexandre Frade +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -199,7 +199,7 @@ struct scan_control { + /* + * From 0 .. MAX_SWAPPINESS. 
Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 1; + + #ifdef CONFIG_MEMCG + diff --git a/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch new file mode 100644 index 0000000..ac8cbe4 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch @@ -0,0 +1,84 @@ +From 8b42012dc4aa0008ad10cf7575f684ebc3a587cf Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Wed, 15 Jun 2022 17:07:29 +0000 +Subject: [PATCH 15/19] XANMOD: sched/autogroup: Add kernel parameter and + config option to enable/disable autogroup feature by default + +Signed-off-by: Alexandre Frade +--- + Documentation/admin-guide/kernel-parameters.txt | 6 ++++-- + init/Kconfig | 12 ++++++++++++ + kernel/sched/autogroup.c | 9 ++++++--- + 3 files changed, 22 insertions(+), 5 deletions(-) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -493,6 +493,10 @@ + Format: (must be >=0) + Default: 64 + ++ autogroup= [KNL] Enable or disable scheduler automatic task group ++ creation. ++ Format: ++ + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } +@@ -3835,8 +3839,6 @@ + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any + IOAPICs that may be present in the system. + +- noautogroup Disable scheduler automatic task group creation. +- + nocache [ARM,EARLY] + + no_console_suspend +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1309,6 +1309,18 @@ config SCHED_AUTOGROUP + desktop applications. Task group autogeneration is currently based + upon task session. + ++config SCHED_AUTOGROUP_DEFAULT_ENABLED ++ bool "Enable automatic process group scheduling feature" ++ default y ++ depends on SCHED_AUTOGROUP ++ help ++ If set, automatic process group scheduling will be enabled per ++ default but can be disabled through passing autogroup=0 on the ++ kernel commandline during boot or a value of 0 via the file ++ proc/sys/kernel/sched_autogroup_enabled. ++ ++ If unsure say Y. ++ + config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK +--- a/kernel/sched/autogroup.c ++++ b/kernel/sched/autogroup.c +@@ -4,7 +4,8 @@ + * Auto-group scheduling implementation: + */ + +-unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; ++unsigned int __read_mostly sysctl_sched_autogroup_enabled = ++ IS_ENABLED(CONFIG_SCHED_AUTOGROUP_DEFAULT_ENABLED) ? 1 : 0; + static struct autogroup autogroup_default; + static atomic_t autogroup_seq_nr; + +@@ -219,11 +220,13 @@ void sched_autogroup_exit(struct signal_ + + static int __init setup_autogroup(char *str) + { +- sysctl_sched_autogroup_enabled = 0; ++ unsigned long enabled; ++ if (!kstrtoul(str, 0, &enabled)) ++ sysctl_sched_autogroup_enabled = enabled ? 
1 : 0; + + return 1; + } +-__setup("noautogroup", setup_autogroup); ++__setup("autogroup=", setup_autogroup); + + #ifdef CONFIG_PROC_FS + diff --git a/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch new file mode 100644 index 0000000..b8f9198 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch @@ -0,0 +1,62 @@ +From f41037c4ee33dd201c1750727daa390a3f652516 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Tue, 31 Mar 2020 13:32:08 -0300 +Subject: [PATCH 16/19] XANMOD: cpufreq: tunes ondemand and conservative + governor for performance + +Signed-off-by: Alexandre Frade +--- + drivers/cpufreq/cpufreq_conservative.c | 8 ++++---- + drivers/cpufreq/cpufreq_ondemand.c | 8 ++++---- + 2 files changed, 8 insertions(+), 8 deletions(-) + +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +@@ -47,9 +47,9 @@ static inline unsigned int get_freq_step + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Every sampling_rate * +- * sampling_down_factor, we check, if current idle time is more than 80% ++ * sampling_down_factor, we check, if current idle time is more than 74% + * (default), then we try to decrease frequency + * + * Frequency updates happen at minimum steps of 5% (default) of maximum +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,10 +18,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_SAMPLING_DOWN_FACTOR (100) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) + +@@ -128,7 +128,7 @@ static void dbs_freq_increase(struct cpu + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ diff --git a/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch new file mode 100644 index 0000000..9a2d7f8 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch @@ -0,0 +1,42 @@ +From 000614733021b6723cfd78a4908b9b2e869ea001 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 16 Sep 2024 08:09:56 +0000 +Subject: [PATCH 17/19] XANMOD: lib/kconfig.debug: disable default + SYMBOLIC_ERRNAME and DEBUG_BUGVERBOSE + +Signed-off-by: Alexandre Frade +--- + fs/bcachefs/Kconfig | 1 - + lib/Kconfig.debug | 4 ++-- + 2 files changed, 2 insertions(+), 3 deletions(-) + +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -23,7 +23,6 @@ config BCACHEFS_FS + select XOR_BLOCKS + select XXHASH + select SRCU +- select SYMBOLIC_ERRNAME + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -190,7 +190,7 @@ config DYNAMIC_DEBUG_CORE + + config SYMBOLIC_ERRNAME + bool "Support symbolic error names in printf" +- default y if PRINTK ++ default n + help + If you say Y here, the kernel's printf implementation will + be able to print symbolic error names such as ENOSPC instead +@@ -200,7 +200,7 @@ config SYMBOLIC_ERRNAME + config DEBUG_BUGVERBOSE + bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT + depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE) +- default y ++ default n + help + Say Y here to make BUG() panics output the file name and line number + of the BUG call as well as the EIP and oops trace. This aids diff --git a/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch new file mode 100644 index 0000000..86c8375 --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch @@ -0,0 +1,21 @@ +From bea9f2e6b80582dc36b5693e08633546b14b2a4e Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Sun, 29 May 2022 00:57:40 +0000 +Subject: [PATCH 18/19] XANMOD: scripts/setlocalversion: remove "+" tag for git + repo short version + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 1 - + 1 file changed, 1 deletion(-) + +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -92,7 +92,6 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" + return + fi + # If we are past the tagged commit, we pretty print it. 
diff --git a/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch new file mode 100644 index 0000000..148106e --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch @@ -0,0 +1,19 @@ +From eb42d6c8411f9a70472e6f3632051e2e44910843 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 24 Apr 2023 04:50:34 +0000 +Subject: [PATCH 19/19] XANMOD: scripts/setlocalversion: Move localversion* + files to the end + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -182,4 +182,4 @@ elif [ "${LOCALVERSION+set}" != "set" ]; + scm_version="$(scm_version --short)" + fi + +-echo "${KERNELVERSION}${file_localversion}${config_localversion}${LOCALVERSION}${scm_version}" ++echo "${KERNELVERSION}${config_localversion}${LOCALVERSION}${scm_version}${file_localversion}" diff --git a/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch b/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch new file mode 100644 index 0000000..b43efe5 --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch @@ -0,0 +1,38 @@ +From 2b70fecae5eabc439190c435be020db8afd3bd33 Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Wed, 7 Aug 2024 17:04:02 -0400 +Subject: Partially revert "drm/amd/amdgpu: add pipe1 hardware support" + +This partially reverts commit b7a1a0ef12b81957584fef7b61e2d5ec049c7209. + +A user reported stuttering under heavy gfx load with this commit. +I suspect it's due to the fact that the gfx contexts are shared +between the pipes so if there is alot of load on one pipe, we could +end up stalling waiting for a context. + +That said, having both pipes is useful in some contexts and +this patch was actually enabled mainly to support some SR-IOV +use cases, so leave it enabled for SR-IOV. 
+ +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3519 +Signed-off-by: Alex Deucher +Cc: ZhenGuo Yin +Cc: Alex Deucher +--- + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +@@ -4711,7 +4711,10 @@ static int gfx_v10_0_sw_init(void *handl + case IP_VERSION(10, 3, 3): + case IP_VERSION(10, 3, 7): + adev->gfx.me.num_me = 1; +- adev->gfx.me.num_pipe_per_me = 2; ++ if (amdgpu_sriov_vf(adev)) ++ adev->gfx.me.num_pipe_per_me = 2; ++ else ++ adev->gfx.me.num_pipe_per_me = 1; + adev->gfx.me.num_queue_per_pipe = 1; + adev->gfx.mec.num_mec = 2; + adev->gfx.mec.num_pipe_per_mec = 4; diff --git a/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch new file mode 100644 index 0000000..2fb4597 --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch @@ -0,0 +1,42 @@ +From 3c881c7323af4269aa58848dd9da694d5a73d353 Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Thu, 3 Oct 2024 09:57:38 -0400 +Subject: drm/amdgpu/swsmu: default to fullscreen 3D profile for dGPUs + +This uses more aggressive hueristics than the the bootup default +profile. On windows the OS has a special fullscreen 3D mode +where this is used. Since we don't have the equivalent on Linux +default to this profile for dGPUs. + +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3618 +Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1500 +Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3131 +Fixes: c50fe289ed72 ("drm/amdgpu/swsmu: always force a state reprogram on init") +Reviewed-by: Kenneth Feng +Signed-off-by: Alex Deucher +--- + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -1257,7 +1257,6 @@ static int smu_sw_init(void *handle) + atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); + atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); + +- smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; + smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; + smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; + smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; +@@ -1266,6 +1265,11 @@ static int smu_sw_init(void *handle) + smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; + smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; + ++ if (smu->is_apu) ++ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; ++ else ++ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; ++ + smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; + smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D; + smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING; diff --git a/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch new file mode 100644 index 0000000..9e4c756 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch @@ -0,0 +1,1183 @@ +From cba900f94444c8c4cde6b5b4edfaeb6160b55ba3 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 26 Apr 2021 
22:12:46 +0200 +Subject: ZEN: Add VHBA driver + +remote https://github.com/cdemu/cdemu +tag vhba-module-20240202 +--- + drivers/scsi/Kconfig | 2 + + drivers/scsi/Makefile | 1 + + drivers/scsi/vhba/Kconfig | 9 + + drivers/scsi/vhba/Makefile | 4 + + drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++ + 5 files changed, 1140 insertions(+) + create mode 100644 drivers/scsi/vhba/Kconfig + create mode 100644 drivers/scsi/vhba/Makefile + create mode 100644 drivers/scsi/vhba/vhba.c + +--- a/drivers/scsi/Kconfig ++++ b/drivers/scsi/Kconfig +@@ -1522,4 +1522,6 @@ endif # SCSI_LOWLEVEL + + source "drivers/scsi/device_handler/Kconfig" + ++source "drivers/scsi/vhba/Kconfig" ++ + endmenu +--- a/drivers/scsi/Makefile ++++ b/drivers/scsi/Makefile +@@ -153,6 +153,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o + obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + + obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ ++obj-$(CONFIG_VHBA) += vhba/ + + # This goes last, so that "real" scsi devices probe earlier + obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o +--- /dev/null ++++ b/drivers/scsi/vhba/Kconfig +@@ -0,0 +1,9 @@ ++config VHBA ++ tristate "Virtual (SCSI) Host Bus Adapter" ++ depends on SCSI ++ help ++ This is the in-kernel part of CDEmu, a CD/DVD-ROM device ++ emulator. ++ ++ This driver can also be built as a module. If so, the module ++ will be called vhba. +--- /dev/null ++++ b/drivers/scsi/vhba/Makefile +@@ -0,0 +1,4 @@ ++VHBA_VERSION := 20240202 ++ ++obj-$(CONFIG_VHBA) += vhba.o ++ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror +--- /dev/null ++++ b/drivers/scsi/vhba/vhba.c +@@ -0,0 +1,1124 @@ ++/* ++ * vhba.c ++ * ++ * Copyright (C) 2007-2012 Chia-I Wu ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#define pr_fmt(fmt) "vhba: " fmt ++ ++#include ++ ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) ++#include ++#else ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_COMPAT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++MODULE_AUTHOR("Chia-I Wu"); ++MODULE_VERSION(VHBA_VERSION); ++MODULE_DESCRIPTION("Virtual SCSI HBA"); ++MODULE_LICENSE("GPL"); ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) ++#define sdev_dbg(sdev, fmt, a...) \ ++ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) ++#define scmd_dbg(scmd, fmt, a...) 
\ ++ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) ++#endif ++ ++#define VHBA_MAX_SECTORS_PER_IO 256 ++#define VHBA_MAX_BUS 16 ++#define VHBA_MAX_ID 16 ++#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) ++#define VHBA_KBUF_SIZE PAGE_SIZE ++ ++#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++ ++ ++static int vhba_can_queue = 32; ++module_param_named(can_queue, vhba_can_queue, int, 0); ++ ++ ++enum vhba_req_state { ++ VHBA_REQ_FREE, ++ VHBA_REQ_PENDING, ++ VHBA_REQ_READING, ++ VHBA_REQ_SENT, ++ VHBA_REQ_WRITING, ++}; ++ ++struct vhba_command { ++ struct scsi_cmnd *cmd; ++ /* metatags are per-host. not to be confused with ++ queue tags that are usually per-lun */ ++ unsigned long metatag; ++ int status; ++ struct list_head entry; ++}; ++ ++struct vhba_device { ++ unsigned int num; ++ spinlock_t cmd_lock; ++ struct list_head cmd_list; ++ wait_queue_head_t cmd_wq; ++ atomic_t refcnt; ++ ++ unsigned char *kbuf; ++ size_t kbuf_size; ++}; ++ ++struct vhba_host { ++ struct Scsi_Host *shost; ++ spinlock_t cmd_lock; ++ int cmd_next; ++ struct vhba_command *commands; ++ spinlock_t dev_lock; ++ struct vhba_device *devices[VHBA_MAX_DEVICES]; ++ int num_devices; ++ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); ++ int chgtype[VHBA_MAX_DEVICES]; ++ struct work_struct scan_devices; ++}; ++ ++#define MAX_COMMAND_SIZE 16 ++ ++struct vhba_request { ++ __u32 metatag; ++ __u32 lun; ++ __u8 cdb[MAX_COMMAND_SIZE]; ++ __u8 cdb_len; ++ __u32 data_len; ++}; ++ ++struct vhba_response { ++ __u32 metatag; ++ __u32 status; ++ __u32 data_len; ++}; ++ ++ ++ ++static struct vhba_command *vhba_alloc_command (void); ++static void vhba_free_command (struct vhba_command *vcmd); ++ ++static struct platform_device vhba_platform_device; ++ ++ ++ ++/* These functions define a symmetric 1:1 mapping between device numbers and ++ the bus and id. We have reserved the last id per bus for the host itself. 
*/ ++static void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) ++{ ++ *bus = devnum / (VHBA_MAX_ID-1); ++ *id = devnum % (VHBA_MAX_ID-1); ++} ++ ++static unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) ++{ ++ return (bus * (VHBA_MAX_ID-1)) + id; ++} ++ ++static struct vhba_device *vhba_device_alloc (void) ++{ ++ struct vhba_device *vdev; ++ ++ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); ++ if (!vdev) { ++ return NULL; ++ } ++ ++ spin_lock_init(&vdev->cmd_lock); ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ init_waitqueue_head(&vdev->cmd_wq); ++ atomic_set(&vdev->refcnt, 1); ++ ++ vdev->kbuf = NULL; ++ vdev->kbuf_size = 0; ++ ++ return vdev; ++} ++ ++static void vhba_device_put (struct vhba_device *vdev) ++{ ++ if (atomic_dec_and_test(&vdev->refcnt)) { ++ kfree(vdev); ++ } ++} ++ ++static struct vhba_device *vhba_device_get (struct vhba_device *vdev) ++{ ++ atomic_inc(&vdev->refcnt); ++ ++ return vdev; ++} ++ ++static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vcmd = vhba_alloc_command(); ++ if (!vcmd) { ++ return SCSI_MLQUEUE_HOST_BUSY; ++ } ++ ++ vcmd->cmd = cmd; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; ++#else ++ vcmd->metatag = vcmd->cmd->request->tag; ++#endif ++ list_add_tail(&vcmd->entry, &vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ wake_up_interruptible(&vdev->cmd_wq); ++ ++ return 0; ++} ++ ++static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_command *vcmd; ++ int retval; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->cmd == cmd) { ++ list_del_init(&vcmd->entry); ++ break; ++ } ++ } ++ ++ /* command not found */ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ return SUCCESS; ++ } ++ ++ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ scmd_dbg(cmd, "wait for I/O before aborting\n"); ++ schedule_timeout(1); ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; ++ ++ vhba_free_command(vcmd); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return retval; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++static int vhba_slave_alloc(struct scsi_device *sdev) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ ++ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++ if (!shost_use_blk_mq(shost) && shost->bqt) { ++#else ++ if (shost->bqt) { ++#endif ++ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); ++ } ++ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); ++ ++ return 0; ++} ++#endif ++ ++static void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (!sdev) { ++ scsi_add_device(vhost->shost, bus, id, 0); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); ++ scsi_device_put(sdev); ++ } ++} ++ ++static void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (sdev) { ++ scsi_remove_device(sdev); ++ scsi_device_put(sdev); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); ++ } ++} ++ ++static void vhba_scan_devices (struct work_struct *work) ++{ ++ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); ++ unsigned long flags; ++ int change, exists; ++ unsigned int devnum; ++ unsigned int bus, id; ++ ++ for (;;) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ ++ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); ++ if (devnum >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ break; ++ } ++ change = vhost->chgtype[devnum]; ++ exists = vhost->devices[devnum] != NULL; ++ ++ vhost->chgtype[devnum] = 0; ++ clear_bit(devnum, vhost->chgmap); ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ devnum_to_bus_and_id(devnum, &bus, &id); ++ ++ if (change < 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ } else if (change > 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* quick sequence of add/remove or remove/add; we determine ++ which one it was by checking if device structure exists */ ++ if (exists) { ++ /* remove followed by add: remove and (re)add */ ++ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* add followed by remove: no-op */ ++ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); ++ } ++ } ++ } ++} ++ ++static int vhba_add_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned int devnum; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vhba_device_get(vdev); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ if (vhost->num_devices >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ vhba_device_put(vdev); ++ return -EBUSY; ++ } ++ ++ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { ++ if (vhost->devices[devnum] == NULL) { ++ vdev->num = devnum; ++ 
vhost->devices[devnum] = vdev; ++ vhost->num_devices++; ++ set_bit(devnum, vhost->chgmap); ++ vhost->chgtype[devnum]++; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static int vhba_remove_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ set_bit(vdev->num, vhost->chgmap); ++ vhost->chgtype[vdev->num]--; ++ vhost->devices[vdev->num] = NULL; ++ vhost->num_devices--; ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ vhba_device_put(vdev); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static struct vhba_device *vhba_lookup_device (int devnum) ++{ ++ struct vhba_host *vhost; ++ struct vhba_device *vdev = NULL; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ if (likely(devnum < VHBA_MAX_DEVICES)) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ vdev = vhost->devices[devnum]; ++ if (vdev) { ++ vdev = vhba_device_get(vdev); ++ } ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ } ++ ++ return vdev; ++} ++ ++static struct vhba_command *vhba_alloc_command (void) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ int i; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ ++ vcmd = vhost->commands + vhost->cmd_next++; ++ if (vcmd->status != VHBA_REQ_FREE) { ++ for (i = 0; i < vhba_can_queue; i++) { ++ vcmd = vhost->commands + i; ++ ++ if (vcmd->status == VHBA_REQ_FREE) { ++ vhost->cmd_next = i + 1; ++ break; ++ } ++ } ++ ++ if (i == vhba_can_queue) { ++ vcmd = NULL; ++ } ++ } ++ ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ vhost->cmd_next %= vhba_can_queue; ++ ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++ ++ return vcmd; ++} ++ ++static void vhba_free_command (struct vhba_command *vcmd) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ vcmd->status = VHBA_REQ_FREE; ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++} ++ ++static int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ unsigned int devnum; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); ++#else ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); ++#endif ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (!vdev) { ++ scmd_dbg(cmd, "no such device\n"); ++ ++ cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(cmd); ++#else ++ cmd->scsi_done(cmd); ++#endif ++ ++ return 0; ++ } ++ ++ retval = vhba_device_queue(vdev, cmd); ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_abort (struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval = SUCCESS; ++ unsigned int devnum; ++ ++ scmd_dbg(cmd, "abort %p\n", cmd); ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (vdev) { ++ retval = vhba_device_dequeue(vdev, cmd); ++ vhba_device_put(vdev); ++ } else { ++ cmd->result = DID_NO_CONNECT << 16; ++ } ++ ++ return 
retval; ++} ++ ++static struct scsi_host_template vhba_template = { ++ .module = THIS_MODULE, ++ .name = "vhba", ++ .proc_name = "vhba", ++ .queuecommand = vhba_queuecommand, ++ .eh_abort_handler = vhba_abort, ++ .this_id = -1, ++ .max_sectors = VHBA_MAX_SECTORS_PER_IO, ++ .sg_tablesize = 256, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++ .slave_alloc = vhba_slave_alloc, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) ++ .tag_alloc_policy = BLK_TAG_ALLOC_RR, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ .use_blk_tags = 1, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) ++ .max_segment_size = VHBA_KBUF_SIZE, ++#endif ++}; ++ ++static ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) ++{ ++ struct vhba_request vreq; ++ ssize_t ret; ++ ++ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", ++ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); ++ ++ ret = sizeof(vreq); ++ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { ++ ret += scsi_bufflen(cmd); ++ } ++ ++ if (ret > buf_len) { ++ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); ++ return -EIO; ++ } ++ ++ vreq.metatag = metatag; ++ vreq.lun = cmd->device->lun; ++ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); ++ vreq.cdb_len = cmd->cmd_len; ++ vreq.data_len = scsi_bufflen(cmd); ++ ++ if (copy_to_user(buf, &vreq, sizeof(vreq))) { ++ return -EFAULT; ++ } ++ ++ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { ++ buf += sizeof(vreq); ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *) buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = sg->length; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(vdev->kbuf, kaddr + sg->offset, len); ++ kunmap_atomic(kaddr); ++ ++ if (copy_to_user(uaddr, vdev->kbuf, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ } ++ } else { ++ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { ++ return -EFAULT; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) ++{ ++ ssize_t ret = 0; ++ ++ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", ++ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); ++ ++ if (res->status) { ++ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { ++ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); ++ res->data_len = SCSI_SENSE_BUFFERSIZE; ++ } ++ ++ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ cmd->result = res->status; ++ ++ ret += res->data_len; ++ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { ++ size_t to_read; ++ ++ if (res->data_len > scsi_bufflen(cmd)) { ++ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); ++ res->data_len = scsi_bufflen(cmd); ++ } ++ ++ to_read = res->data_len; ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist 
*sg; ++ int i; ++ ++ uaddr = (unsigned char *)buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = (sg->length < to_read) ? sg->length : to_read; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ if (copy_from_user(vdev->kbuf, uaddr, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(kaddr + sg->offset, vdev->kbuf, len); ++ kunmap_atomic(kaddr); ++ ++ to_read -= len; ++ if (to_read == 0) { ++ break; ++ } ++ } ++ } else { ++ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ to_read -= res->data_len; ++ } ++ ++ scsi_set_resid(cmd, to_read); ++ ++ ret += res->data_len - to_read; ++ } ++ ++ return ret; ++} ++ ++static struct vhba_command *next_command (struct vhba_device *vdev) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->status == VHBA_REQ_PENDING) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->metatag == metatag) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) ++{ ++ struct vhba_command *vcmd; ++ DEFINE_WAIT(wait); ++ ++ while (!(vcmd = next_command(vdev))) { ++ if (signal_pending(current)) { ++ break; ++ } ++ ++ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ schedule(); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ finish_wait(&vdev->cmd_wq, &wait); ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_READING; ++ } ++ ++ return vcmd; ++} ++ ++static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ ssize_t ret; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ /* Get next command */ ++ if (file->f_flags & O_NONBLOCK) { ++ /* Non-blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = next_command(vdev); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -EWOULDBLOCK; ++ } ++ } else { ++ /* Blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = wait_command(vdev, flags); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -ERESTARTSYS; ++ } ++ } ++ ++ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++ vcmd->status = VHBA_REQ_SENT; ++ *offset += ret; ++ } else { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ struct vhba_response res; ++ ssize_t ret; ++ unsigned long flags; ++ ++ if (buf_len < sizeof(res)) { ++ return -EIO; ++ } ++ ++ if (copy_from_user(&res, buf, sizeof(res))) { ++ return -EFAULT; ++ } ++ ++ vdev = file->private_data; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = 
match_command(vdev, res.metatag); ++ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ pr_debug("ctl dev #%u not expecting response\n", vdev->num); ++ return -EIO; ++ } ++ vcmd->status = VHBA_REQ_WRITING; ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ ret += sizeof(res); ++ ++ /* don't compete with vhba_device_dequeue */ ++ if (!list_empty(&vcmd->entry)) { ++ list_del_init(&vcmd->entry); ++ vhba_free_command(vcmd); ++ } ++ } else { ++ vcmd->status = VHBA_REQ_SENT; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vhba_device *vdev = file->private_data; ++ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ switch (cmd) { ++ case 0xBEEF001: { ++ unsigned int ident[4]; /* host, channel, id, lun */ ++ ++ ident[0] = vhost->shost->host_no; ++ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); ++ ident[3] = 0; /* lun */ ++ ++ if (copy_to_user((void *) arg, ident, sizeof(ident))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ case 0xBEEF002: { ++ unsigned int devnum = vdev->num; ++ ++ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ } ++ ++ return -ENOTTY; ++} ++ ++#ifdef CONFIG_COMPAT ++static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned long compat_arg = (unsigned long)compat_ptr(arg); ++ return vhba_ctl_ioctl(file, cmd, compat_arg); ++} ++#endif ++ ++static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) ++{ ++ struct vhba_device *vdev = file->private_data; ++ unsigned int mask = 0; ++ unsigned long flags; ++ ++ poll_wait(file, &vdev->cmd_wq, wait); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (next_command(vdev)) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return mask; ++} ++ ++static int vhba_ctl_open (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ ++ pr_debug("ctl dev open\n"); ++ ++ /* check if vhba is probed */ ++ if (!platform_get_drvdata(&vhba_platform_device)) { ++ return -ENODEV; ++ } ++ ++ vdev = vhba_device_alloc(); ++ if (!vdev) { ++ return -ENOMEM; ++ } ++ ++ vdev->kbuf_size = VHBA_KBUF_SIZE; ++ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); ++ if (!vdev->kbuf) { ++ return -ENOMEM; ++ } ++ ++ if (!(retval = vhba_add_device(vdev))) { ++ file->private_data = vdev; ++ } ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_ctl_release (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ pr_debug("ctl dev release\n"); ++ ++ vhba_device_get(vdev); ++ vhba_remove_device(vdev); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); ++ ++ scmd_dbg(vcmd->cmd, "device released with command %lu (%p)\n", vcmd->metatag, vcmd->cmd); ++ vcmd->cmd->result = DID_NO_CONNECT << 16; 
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ vhba_free_command(vcmd); ++ } ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ kfree(vdev->kbuf); ++ vdev->kbuf = NULL; ++ ++ vhba_device_put(vdev); ++ ++ return 0; ++} ++ ++static struct file_operations vhba_ctl_fops = { ++ .owner = THIS_MODULE, ++ .open = vhba_ctl_open, ++ .release = vhba_ctl_release, ++ .read = vhba_ctl_read, ++ .write = vhba_ctl_write, ++ .poll = vhba_ctl_poll, ++ .unlocked_ioctl = vhba_ctl_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = vhba_ctl_compat_ioctl, ++#endif ++}; ++ ++static struct miscdevice vhba_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "vhba_ctl", ++ .fops = &vhba_ctl_fops, ++}; ++ ++static int vhba_probe (struct platform_device *pdev) ++{ ++ struct Scsi_Host *shost; ++ struct vhba_host *vhost; ++ int i; ++ ++ vhba_can_queue = clamp(vhba_can_queue, 1, 256); ++ ++ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); ++ if (!shost) { ++ return -ENOMEM; ++ } ++ ++ shost->max_channel = VHBA_MAX_BUS-1; ++ shost->max_id = VHBA_MAX_ID; ++ /* we don't support lun > 0 */ ++ shost->max_lun = 1; ++ shost->max_cmd_len = MAX_COMMAND_SIZE; ++ shost->can_queue = vhba_can_queue; ++ shost->cmd_per_lun = vhba_can_queue; ++ ++ vhost = (struct vhba_host *)shost->hostdata; ++ memset(vhost, 0, sizeof(struct vhba_host)); ++ ++ vhost->shost = shost; ++ vhost->num_devices = 0; ++ spin_lock_init(&vhost->dev_lock); ++ spin_lock_init(&vhost->cmd_lock); ++ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); ++ vhost->cmd_next = 0; ++ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); ++ if (!vhost->commands) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < vhba_can_queue; i++) { ++ vhost->commands[i].status = VHBA_REQ_FREE; ++ } ++ ++ platform_set_drvdata(pdev, vhost); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ i = scsi_init_shared_tag_map(shost, vhba_can_queue); ++ if (i) return i; ++#endif ++ ++ if (scsi_add_host(shost, &pdev->dev)) { ++ scsi_host_put(shost); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int vhba_remove (struct platform_device *pdev) ++{ ++ struct vhba_host *vhost; ++ struct Scsi_Host *shost; ++ ++ vhost = platform_get_drvdata(pdev); ++ shost = vhost->shost; ++ ++ scsi_remove_host(shost); ++ scsi_host_put(shost); ++ ++ kfree(vhost->commands); ++ ++ return 0; ++} ++ ++static void vhba_release (struct device * dev) ++{ ++ return; ++} ++ ++static struct platform_device vhba_platform_device = { ++ .name = "vhba", ++ .id = -1, ++ .dev = { ++ .release = vhba_release, ++ }, ++}; ++ ++static struct platform_driver vhba_platform_driver = { ++ .driver = { ++ .owner = THIS_MODULE, ++ .name = "vhba", ++ }, ++ .probe = vhba_probe, ++ .remove = vhba_remove, ++}; ++ ++static int __init vhba_init (void) ++{ ++ int ret; ++ ++ ret = platform_device_register(&vhba_platform_device); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ ret = platform_driver_register(&vhba_platform_driver); ++ if (ret < 0) { ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ ret = misc_register(&vhba_miscdev); ++ if (ret < 0) { ++ platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __exit vhba_exit(void) ++{ ++ misc_deregister(&vhba_miscdev); ++ platform_driver_unregister(&vhba_platform_driver); ++ 
platform_device_unregister(&vhba_platform_device); ++} ++ ++module_init(vhba_init); ++module_exit(vhba_exit); ++ diff --git a/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch b/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch new file mode 100644 index 0000000..e4b2fd3 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch @@ -0,0 +1,35 @@ +From 6df7338351c342060088aa9abd561b81ccc113d2 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Sun, 15 Sep 2024 19:05:46 +0000 +Subject: vhba: Fix compat with kernel 6.11 + +Upstream commit 0edb555a65d1ef047a9805051c36922b52a38a9d changed the +return value of the `remove` callback from `int` to `void`. +--- + drivers/scsi/vhba/vhba.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/scsi/vhba/vhba.c ++++ b/drivers/scsi/vhba/vhba.c +@@ -1049,7 +1049,11 @@ static int vhba_probe (struct platform_d + return 0; + } + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) + static int vhba_remove (struct platform_device *pdev) ++#else ++static void vhba_remove (struct platform_device *pdev) ++#endif + { + struct vhba_host *vhost; + struct Scsi_Host *shost; +@@ -1062,7 +1066,9 @@ static int vhba_remove (struct platform_ + + kfree(vhost->commands); + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) + return 0; ++#endif + } + + static void vhba_release (struct device * dev) diff --git a/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch new file mode 100644 index 0000000..69c7a89 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch @@ -0,0 +1,626 @@ +From 567908cc05dc5e02b1b9c26620bce3791559f9d4 Mon Sep 17 00:00:00 2001 +From: Daniel Drake +Date: Tue, 4 Jun 2019 14:51:21 +0800 +Subject: ZEN: PCI: Add Intel remapped NVMe device support + +Contains: + - PCI: Add Intel remapped NVMe device support + + Consumer products that are configured by default to run the Intel SATA AHCI + controller in "RAID" or "Intel RST Premium With Intel Optane System + Acceleration" mode are becoming increasingly prevalent. + + Unde this mode, NVMe devices are remapped into the SATA device and become + hidden from the PCI bus, which means that Linux users cannot access their + storage devices unless they go into the firmware setup menu to revert back + to AHCI mode - assuming such option is available. Lack of support for this + mode is also causing complications for vendors who distribute Linux. + + Add support for the remapped NVMe mode by creating a virtual PCI bus, + where the AHCI and NVMe devices are presented separately, allowing the + ahci and nvme drivers to bind in the normal way. + + Unfortunately the NVMe device configuration space is inaccesible under + this scheme, so we provide a fake one, and hope that no DeviceID-based + quirks are needed. The interrupt is shared between the AHCI and NVMe + devices. + + Allow pci_real_dma_dev() to traverse back to the real DMA device from + the PCI devices created on our virtual bus, in case the iommu driver + will be involved with data transfers here. + + The existing ahci driver is modified to not claim devices where remapped + NVMe devices are present, allowing this new driver to step in. + + The details of the remapping scheme came from patches previously + posted by Dan Williams and the resulting discussion. 
+ + https://phabricator.endlessm.com/T24358 + https://phabricator.endlessm.com/T29119 + + Signed-off-by: Daniel Drake + + - PCI: Fix order of remapped NVMe devices +--- + arch/x86/include/asm/pci.h | 6 + + arch/x86/pci/common.c | 7 +- + drivers/ata/ahci.c | 23 +- + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 ++++++++++++++++++++++ + 5 files changed, 488 insertions(+), 16 deletions(-) + create mode 100644 drivers/pci/controller/intel-nvme-remap.c + +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -26,6 +26,7 @@ struct pci_sysdata { + #if IS_ENABLED(CONFIG_VMD) + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ + #endif ++ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ + }; + + extern int pci_routeirq; +@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus + #define is_vmd(bus) false + #endif /* CONFIG_VMD */ + ++static inline bool is_nvme_remap(struct pci_bus *bus) ++{ ++ return to_pci_sysdata(bus)->nvme_remap_dev != NULL; ++} ++ + /* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) + return 0; + } + +-#if IS_ENABLED(CONFIG_VMD) + struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) + { ++#if IS_ENABLED(CONFIG_VMD) + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; ++#endif ++ ++ if (is_nvme_remap(dev->bus)) ++ return to_pci_sysdata(dev->bus)->nvme_remap_dev; + + return dev; + } +-#endif +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_han + } + #endif + +-static void ahci_remap_check(struct pci_dev *pdev, int bar, ++static int ahci_remap_check(struct pci_dev *pdev, int bar, + struct ahci_host_priv *hpriv) + { + int i; +@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_ + pci_resource_len(pdev, bar) < SZ_512K || + bar != AHCI_PCI_BAR_STANDARD || + !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) +- return; ++ return 0; + + cap = readq(hpriv->mmio + AHCI_REMAP_CAP); + for (i = 0; i < AHCI_MAX_REMAP; i++) { +@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_ + } + + if (!hpriv->remapped_nvme) +- return; ++ return 0; + +- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", +- hpriv->remapped_nvme); +- dev_warn(&pdev->dev, +- "Switch your BIOS from RAID to AHCI mode to use them.\n"); +- +- /* +- * Don't rely on the msi-x capability in the remap case, +- * share the legacy interrupt across ahci and remapped devices. 
+- */ +- hpriv->flags |= AHCI_HFLAG_NO_MSI; ++ /* Abort probe, allowing intel-nvme-remap to step in when available */ ++ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); ++ return -ENODEV; + } + + static int ahci_get_irq_vector(struct ata_host *host, int port) +@@ -1896,7 +1889,9 @@ static int ahci_init_one(struct pci_dev + hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; + + /* detect remapped nvme devices */ +- ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ if (rc) ++ return rc; + + sysfs_add_file_to_group(&pdev->dev.kobj, + &dev_attr_remapped_nvme.attr, +--- a/drivers/pci/controller/Makefile ++++ b/drivers/pci/controller/Makefile +@@ -1,4 +1,10 @@ + # SPDX-License-Identifier: GPL-2.0 ++ifdef CONFIG_X86_64 ++ifdef CONFIG_SATA_AHCI ++obj-y += intel-nvme-remap.o ++endif ++endif ++ + obj-$(CONFIG_PCIE_CADENCE) += cadence/ + obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o + obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o +--- /dev/null ++++ b/drivers/pci/controller/intel-nvme-remap.c +@@ -0,0 +1,462 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel remapped NVMe device support. ++ * ++ * Copyright (c) 2019 Endless Mobile, Inc. ++ * Author: Daniel Drake ++ * ++ * Some products ship by default with the SATA controller in "RAID" or ++ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this ++ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe ++ * devices disappear from the PCI bus, and instead their I/O memory becomes ++ * available within the AHCI device BARs. ++ * ++ * This scheme is understood to be a way of avoiding usage of the standard ++ * Windows NVMe driver under that OS, instead mandating usage of Intel's ++ * driver instead, which has better power management, and presumably offers ++ * some RAID/disk-caching solutions too. ++ * ++ * Here in this driver, we support the remapped NVMe mode by claiming the ++ * AHCI device and creating a fake PCIe root port. On the new bus, the ++ * original AHCI device is exposed with only minor tweaks. Then, fake PCI ++ * devices corresponding to the remapped NVMe devices are created. The usual ++ * ahci and nvme drivers are then expected to bind to these devices and ++ * operate as normal. ++ * ++ * The PCI configuration space for the NVMe devices is completely ++ * unavailable, so we fake a minimal one and hope for the best. ++ * ++ * Interrupts are shared between the AHCI and NVMe devices. For simplicity, ++ * we only support the legacy interrupt here, although MSI support ++ * could potentially be added later. ++ */ ++ ++#define MODULE_NAME "intel-nvme-remap" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define AHCI_PCI_BAR_STANDARD 5 ++ ++struct nvme_remap_dev { ++ struct pci_dev *dev; /* AHCI device */ ++ struct pci_bus *bus; /* our fake PCI bus */ ++ struct pci_sysdata sysdata; ++ int irq_base; /* our fake interrupts */ ++ ++ /* ++ * When we detect an all-ones write to a BAR register, this flag ++ * is set, so that we return the BAR size on the next read (a ++ * standard PCI behaviour). ++ * This includes the assumption that an all-ones BAR write is ++ * immediately followed by a read of the same register. ++ */ ++ bool bar_sizing; ++ ++ /* ++ * Resources copied from the AHCI device, to be regarded as ++ * resources on our fake bus. ++ */ ++ struct resource ahci_resources[PCI_NUM_RESOURCES]; ++ ++ /* Resources corresponding to the NVMe devices. 
*/ ++ struct resource remapped_dev_mem[AHCI_MAX_REMAP]; ++ ++ /* Number of remapped NVMe devices found. */ ++ int num_remapped_devices; ++}; ++ ++static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) ++{ ++ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); ++} ++ ++ ++/******** PCI configuration space **********/ ++ ++/* ++ * Helper macros for tweaking returned contents of PCI configuration space. ++ * ++ * value contains len bytes of data read from reg. ++ * If fixup_reg is included in that range, fix up the contents of that ++ * register to fixed_value. ++ */ ++#define NR_FIX8(fixup_reg, fixed_value) do { \ ++ if (reg <= fixup_reg && fixup_reg < reg + len) \ ++ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ ++ } while (0) ++ ++#define NR_FIX16(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ } while (0) ++ ++#define NR_FIX24(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++#define NR_FIX32(fixup_reg, fixed_value) do { \ ++ NR_FIX16(fixup_reg, (u16) fixed_value); \ ++ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++/* ++ * Read PCI config space of the slot 0 (AHCI) device. ++ * We pass through the read request to the underlying device, but ++ * tweak the results in some cases. ++ */ ++static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, ++ int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ int ret; ++ ++ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, ++ reg, len, value); ++ if (ret) ++ return ret; ++ ++ /* ++ * Adjust the device class, to prevent this driver from attempting to ++ * additionally probe the device we're simulating here. ++ */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); ++ ++ /* ++ * Unset interrupt pin, otherwise ACPI tries to find routing ++ * info for our virtual IRQ, fails, and complains. ++ */ ++ NR_FIX8(PCI_INTERRUPT_PIN, 0); ++ ++ /* ++ * Truncate the AHCI BAR to not include the region that covers the ++ * hidden devices. This will cause the ahci driver to successfully ++ * probe th new device (instead of handing it over to this driver). ++ */ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); ++ nrdev->bar_sizing = false; ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* ++ * Read PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we provide a minimal, ++ * fake config space instead. 
++ */ ++static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, ++ int reg, int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct resource *remapped_mem; ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ *value = 0; ++ remapped_mem = &nrdev->remapped_dev_mem[port - 1]; ++ ++ /* Set a Vendor ID, otherwise Linux assumes no device is present */ ++ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); ++ ++ /* Always appear on & bus mastering */ ++ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); ++ ++ /* Set class so that nvme driver probes us */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); ++ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_0, ++ ~(resource_size(remapped_mem) - 1)); ++ nrdev->bar_sizing = false; ++ } else { ++ resource_size_t mem_start = remapped_mem->start; ++ ++ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; ++ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); ++ mem_start >>= 32; ++ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* Read PCI configuration space. */ ++static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 *value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_read_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++/* ++ * Write PCI config space of the slot 0 (AHCI) device. ++ * Apart from the special case of BAR sizing, we disable all writes. ++ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) ++ * that would affect the operation of the NVMe devices. ++ */ ++static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, ++ int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ ++ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { ++ /* ++ * Writing all-ones to a BAR means that the size of the ++ * memory region is being checked. Flag this so that we can ++ * reply with an appropriate size on the next read. ++ */ ++ if (value == ~0) ++ nrdev->bar_sizing = true; ++ ++ return ahci_dev_bus->ops->write(ahci_dev_bus, ++ nrdev->dev->devfn, ++ reg, len, value); ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* ++ * Write PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we reject all ++ * writes, except for the special case of BAR probing. ++ */ ++static int nvme_remap_pci_write_remapped(struct pci_bus *bus, ++ unsigned int port, ++ int reg, int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ /* ++ * Writing all-ones to a BAR means that the size of the memory ++ * region is being checked. Flag this so that we can reply with ++ * an appropriate size on the next read. ++ */ ++ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 ++ && reg <= PCI_BASE_ADDRESS_5) { ++ nrdev->bar_sizing = true; ++ return PCIBIOS_SUCCESSFUL; ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* Write PCI configuration space. 
*/ ++static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_write_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++static struct pci_ops nvme_remap_pci_ops = { ++ .read = nvme_remap_pci_read, ++ .write = nvme_remap_pci_write, ++}; ++ ++ ++/******** Initialization & exit **********/ ++ ++/* ++ * Find a PCI domain ID to use for our fake bus. ++ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). ++ */ ++static int find_free_domain(void) ++{ ++ int domain = 0xffff; ++ struct pci_bus *bus = NULL; ++ ++ while ((bus = pci_find_next_bus(bus)) != NULL) ++ domain = max_t(int, domain, pci_domain_nr(bus)); ++ ++ return domain + 1; ++} ++ ++static int find_remapped_devices(struct nvme_remap_dev *nrdev, ++ struct list_head *resources) ++{ ++ void __iomem *mmio; ++ int i, count = 0; ++ u32 cap; ++ ++ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, ++ pci_resource_len(nrdev->dev, ++ AHCI_PCI_BAR_STANDARD)); ++ if (!mmio) ++ return -ENODEV; ++ ++ /* Check if this device might have remapped nvme devices. */ ++ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || ++ !(readl(mmio + AHCI_VSCAP) & 1)) ++ return -ENODEV; ++ ++ cap = readq(mmio + AHCI_REMAP_CAP); ++ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { ++ struct resource *remapped_mem; ++ ++ if ((cap & (1 << i)) == 0) ++ continue; ++ if (readl(mmio + ahci_remap_dcc(i)) ++ != PCI_CLASS_STORAGE_EXPRESS) ++ continue; ++ ++ /* We've found a remapped device */ ++ remapped_mem = &nrdev->remapped_dev_mem[count++]; ++ remapped_mem->start = ++ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) ++ + ahci_remap_base(i); ++ remapped_mem->end = remapped_mem->start ++ + AHCI_REMAP_N_SIZE - 1; ++ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; ++ pci_add_resource(resources, remapped_mem); ++ } ++ ++ pcim_iounmap(nrdev->dev, mmio); ++ ++ if (count == 0) ++ return -ENODEV; ++ ++ nrdev->num_remapped_devices = count; ++ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", ++ nrdev->num_remapped_devices); ++ return 0; ++} ++ ++static void nvme_remap_remove_root_bus(void *data) ++{ ++ struct pci_bus *bus = data; ++ ++ pci_stop_root_bus(bus); ++ pci_remove_root_bus(bus); ++} ++ ++static int nvme_remap_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ struct nvme_remap_dev *nrdev; ++ LIST_HEAD(resources); ++ int i; ++ int ret; ++ struct pci_dev *child; ++ ++ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); ++ nrdev->sysdata.domain = find_free_domain(); ++ nrdev->sysdata.nvme_remap_dev = dev; ++ nrdev->dev = dev; ++ pci_set_drvdata(dev, nrdev); ++ ++ ret = pcim_enable_device(dev); ++ if (ret < 0) ++ return ret; ++ ++ pci_set_master(dev); ++ ++ ret = find_remapped_devices(nrdev, &resources); ++ if (ret) ++ return ret; ++ ++ /* Add resources from the original AHCI device */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &dev->resource[i]; ++ ++ if (res->start) { ++ struct resource *nr_res = &nrdev->ahci_resources[i]; ++ ++ nr_res->start = res->start; ++ nr_res->end = res->end; ++ nr_res->flags = res->flags; ++ pci_add_resource(&resources, nr_res); ++ } ++ } ++ ++ /* Create virtual interrupts */ ++ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, ++ nrdev->num_remapped_devices + 1, ++ 0); ++ if (nrdev->irq_base < 0) ++ return nrdev->irq_base; ++ ++ /* Create and populate PCI 
bus */ ++ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, ++ &nrdev->sysdata, &resources); ++ if (!nrdev->bus) ++ return -ENODEV; ++ ++ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, ++ nrdev->bus)) ++ return -ENOMEM; ++ ++ /* We don't support sharing MSI interrupts between these devices */ ++ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; ++ ++ pci_scan_child_bus(nrdev->bus); ++ ++ list_for_each_entry(child, &nrdev->bus->devices, bus_list) { ++ /* ++ * Prevent PCI core from trying to move memory BARs around. ++ * The hidden NVMe devices are at fixed locations. ++ */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &child->resource[i]; ++ ++ if (res->flags & IORESOURCE_MEM) ++ res->flags |= IORESOURCE_PCI_FIXED; ++ } ++ ++ /* Share the legacy IRQ between all devices */ ++ child->irq = dev->irq; ++ } ++ ++ pci_assign_unassigned_bus_resources(nrdev->bus); ++ pci_bus_add_devices(nrdev->bus); ++ ++ return 0; ++} ++ ++static const struct pci_device_id nvme_remap_ids[] = { ++ /* ++ * Match all Intel RAID controllers. ++ * ++ * There's overlap here with the set of devices detected by the ahci ++ * driver, but ahci will only successfully probe when there ++ * *aren't* any remapped NVMe devices, and this driver will only ++ * successfully probe when there *are* remapped NVMe devices that ++ * need handling. ++ */ ++ { ++ PCI_VDEVICE(INTEL, PCI_ANY_ID), ++ .class = PCI_CLASS_STORAGE_RAID << 8, ++ .class_mask = 0xffffff00, ++ }, ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, nvme_remap_ids); ++ ++static struct pci_driver nvme_remap_drv = { ++ .name = MODULE_NAME, ++ .id_table = nvme_remap_ids, ++ .probe = nvme_remap_probe, ++}; ++module_pci_driver(nvme_remap_drv); ++ ++MODULE_AUTHOR("Daniel Drake "); ++MODULE_LICENSE("GPL v2"); diff --git a/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch new file mode 100644 index 0000000..e2a70b0 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch @@ -0,0 +1,29 @@ +From 89a4975b413afa5f591c7a18109d35b5e848b582 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sun, 8 Mar 2020 00:31:35 -0800 +Subject: ZEN: Disable stack conservation for GCC + +There's plenty of room on the stack for a few more inlined bytes here +and there. The measured stack usage at runtime is still safe without +this, and performance is surely improved at a microscopic level, so +remove it. 
+ +Signed-off-by: Sultan Alsawaf +--- + Makefile | 5 ----- + 1 file changed, 5 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -1003,11 +1003,6 @@ KBUILD_CFLAGS += -fno-strict-overflow + # Make sure -fstack-check isn't enabled (like gentoo apparently did) + KBUILD_CFLAGS += -fno-stack-check + +-# conserve stack if available +-ifdef CONFIG_CC_IS_GCC +-KBUILD_CFLAGS += -fconserve-stack +-endif +- + # change __FILE__ to the relative path from the srctree + KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) + diff --git a/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch b/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch new file mode 100644 index 0000000..4ef331c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch @@ -0,0 +1,43 @@ +From 5f2e6f795ce9908851acb20ab03af6550ae54f3b Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: ZEN: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -61,14 +61,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb depends on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -80,6 +74,13 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb depends on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch b/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch new file mode 100644 index 0000000..b229161 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch @@ -0,0 +1,105 @@ +From 3d92c251c04b1b4c6363018220af42ec3a294d1e Mon Sep 17 00:00:00 2001 +From: Kenny Levinsen +Date: Sun, 27 Dec 2020 14:43:13 +0000 +Subject: ZEN: Input: evdev - use call_rcu when detaching client + +Significant time was spent on synchronize_rcu in evdev_detach_client +when applications closed evdev devices. Switching VT away from a +graphical environment commonly leads to mass input device closures, +which could lead to noticable delays on systems with many input devices. + +Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev +client struct till after the RCU grace period instead of blocking the +calling application. 
+ +While this does not solve all slow evdev fd closures, it takes care of a +good portion of them, including this simple test: + + #include + #include + + int main(int argc, char *argv[]) + { + int idx, fd; + const char *path = "/dev/input/event0"; + for (idx = 0; idx < 1000; idx++) { + if ((fd = open(path, O_RDWR)) == -1) { + return -1; + } + close(fd); + } + return 0; + } + +Time to completion of above test when run locally: + + Before: 0m27.111s + After: 0m0.018s + +Signed-off-by: Kenny Levinsen +--- + drivers/input/evdev.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +--- a/drivers/input/evdev.c ++++ b/drivers/input/evdev.c +@@ -46,6 +46,7 @@ struct evdev_client { + struct fasync_struct *fasync; + struct evdev *evdev; + struct list_head node; ++ struct rcu_head rcu; + enum input_clock_type clk_type; + bool revoked; + unsigned long *evmasks[EV_CNT]; +@@ -368,13 +369,22 @@ static void evdev_attach_client(struct e + spin_unlock(&evdev->client_lock); + } + ++static void evdev_reclaim_client(struct rcu_head *rp) ++{ ++ struct evdev_client *client = container_of(rp, struct evdev_client, rcu); ++ unsigned int i; ++ for (i = 0; i < EV_CNT; ++i) ++ bitmap_free(client->evmasks[i]); ++ kvfree(client); ++} ++ + static void evdev_detach_client(struct evdev *evdev, + struct evdev_client *client) + { + spin_lock(&evdev->client_lock); + list_del_rcu(&client->node); + spin_unlock(&evdev->client_lock); +- synchronize_rcu(); ++ call_rcu(&client->rcu, evdev_reclaim_client); + } + + static int evdev_open_device(struct evdev *evdev) +@@ -427,7 +437,6 @@ static int evdev_release(struct inode *i + { + struct evdev_client *client = file->private_data; + struct evdev *evdev = client->evdev; +- unsigned int i; + + mutex_lock(&evdev->mutex); + +@@ -439,11 +448,6 @@ static int evdev_release(struct inode *i + + evdev_detach_client(evdev, client); + +- for (i = 0; i < EV_CNT; ++i) +- bitmap_free(client->evmasks[i]); +- +- kvfree(client); +- + evdev_close_device(evdev); + + return 0; +@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inod + + err_free_client: + evdev_detach_client(evdev, client); +- kvfree(client); + return error; + } + diff --git a/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch b/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch new file mode 100644 index 0000000..c94ccd2 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch @@ -0,0 +1,31 @@ +From 67c446794b5fc16009bc1f31aee8846576796b11 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Mon, 11 Jul 2022 19:10:30 -0500 +Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State + drivers + +Although both P-State drivers depend on schedutil in Kconfig, both code +bases do not use any schedutil code. This arbitrarily enables schedutil +when unwanted in some configurations. +--- + drivers/cpufreq/Kconfig.x86 | 2 -- + 1 file changed, 2 deletions(-) + +--- a/drivers/cpufreq/Kconfig.x86 ++++ b/drivers/cpufreq/Kconfig.x86 +@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + select CPU_FREQ_GOV_PERFORMANCE +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver provides a P state for Intel core processors. 
+ The driver implements an internal governor and will become +@@ -39,7 +38,6 @@ config X86_AMD_PSTATE + depends on X86 && ACPI + select ACPI_PROCESSOR + select ACPI_CPPC_LIB if X86_64 +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver adds a CPUFreq driver which utilizes a fine grain + processor performance frequency control range instead of legacy diff --git a/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch b/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch new file mode 100644 index 0000000..007f9c6 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch @@ -0,0 +1,53 @@ +From 1d5cc90283f9de0c4cc996a2f3e6ba0306c1f14d Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
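
Editor's note (not part of the patch): a usage sketch of the new parameter. A distribution kernel that bakes "intel_pstate=disable" into its built-in command line can be switched back per machine from the bootloader, assuming the bootloader-supplied string is parsed after the built-in one so the later option wins. The Debian-style commands below are illustrative only.

    # /etc/default/grub -- append the override to the user command line
    GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_pstate=enable"

    # regenerate the bootloader configuration, then reboot
    update-grub

    # after reboot, confirm which scaling driver actually bound
    cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
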
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2237,6 +2237,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + active + Use intel_pstate driver to bypass the scaling + governors layer of cpufreq and provides it own +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -3524,6 +3524,8 @@ static int __init intel_pstate_setup(cha + + if (!strcmp(str, "disable")) + no_load = 1; ++ else if (!strcmp(str, "enable")) ++ no_load = 0; + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) diff --git a/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch b/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch new file mode 100644 index 0000000..d67e7f0 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch @@ -0,0 +1,91 @@ +From 493aee188c1c7c5cee2791820bfc779932bc10dc Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Fri, 15 Mar 2024 12:36:51 -0500 +Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with + ignore_min_pcap + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 +++ + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 ++++++++++++-- + 4 files changed, 26 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -162,6 +162,7 @@ struct amdgpu_watchdog_timer { + */ + extern int amdgpu_modeset; + extern unsigned int amdgpu_vram_limit; ++extern int amdgpu_ignore_min_pcap; + extern int amdgpu_vis_vram_limit; + extern int amdgpu_gart_size; + extern int amdgpu_gtt_size; +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -135,6 +135,7 @@ enum AMDGPU_DEBUG_MASK { + }; + + unsigned int amdgpu_vram_limit = UINT_MAX; ++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */ + int amdgpu_vis_vram_limit; + int amdgpu_gart_size = -1; /* auto */ + int amdgpu_gtt_size = -1; /* auto */ +@@ -249,6 +250,15 @@ struct amdgpu_watchdog_timer amdgpu_watc + }; + + /** ++ * DOC: ignore_min_pcap (int) ++ * Ignore the minimum power cap. ++ * Useful on graphics cards where the minimum power cap is very high. ++ * The default is 0 (Do not ignore). ++ */ ++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap"); ++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600); ++ ++/** + * DOC: vramlimit (int) + * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). 
+ */ +--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c ++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +@@ -3272,6 +3272,9 @@ static ssize_t amdgpu_hwmon_show_power_c + struct device_attribute *attr, + char *buf) + { ++ if (amdgpu_ignore_min_pcap) ++ return sysfs_emit(buf, "%i\n", 0); ++ + return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN); + } + +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -2762,7 +2762,10 @@ int smu_get_power_limit(void *handle, + *limit = smu->max_power_limit; + break; + case SMU_PPT_LIMIT_MIN: +- *limit = smu->min_power_limit; ++ if (amdgpu_ignore_min_pcap) ++ *limit = 0; ++ else ++ *limit = smu->min_power_limit; + break; + default: + return -EINVAL; +@@ -2786,7 +2789,14 @@ static int smu_set_power_limit(void *han + if (smu->ppt_funcs->set_power_limit) + return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); + +- if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { ++ if (amdgpu_ignore_min_pcap) { ++ if ((limit > smu->max_power_limit)) { ++ dev_err(smu->adev->dev, ++ "New power limit (%d) is over the max allowed %d\n", ++ limit, smu->max_power_limit); ++ return -EINVAL; ++ } ++ } else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { + dev_err(smu->adev->dev, + "New power limit (%d) is out of range [%d,%d]\n", + limit, smu->min_power_limit, smu->max_power_limit); diff --git a/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch b/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch new file mode 100644 index 0000000..b6d30fe --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch @@ -0,0 +1,29 @@ +From 531d884259632814e998d1662690daa1e57dd98c Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Thu, 27 Apr 2023 14:43:57 -0500 +Subject: ZEN: Set default max map count to (INT_MAX - 5) + +Per [Fedora][1], they intend to change the default max map count for +their distribution to improve OOTB compatibility with games played +through Steam/Proton. The value they picked comes from the Steam Deck, +which defaults to INT_MAX - MAPCOUNT_ELF_CORE_MARGIN. + +Since most ZEN and Liquorix users probably play games, follow Valve's +lead and raise this value to their default. + +[1]: https://fedoraproject.org/wiki/Changes/IncreaseVmMaxMapCount +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -197,7 +197,7 @@ static inline void __mm_zero_struct_page + * that. 
+ */ + #define MAPCOUNT_ELF_CORE_MARGIN (5) +-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) ++#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + + extern int sysctl_max_map_count; + diff --git a/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch b/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch new file mode 100644 index 0000000..58ddd57 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch @@ -0,0 +1,24 @@ +From 032775267df11a87616d2ec7f09c0b1b12da5da7 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:10:06 +0100 +Subject: ZEN: INTERACTIVE: Base config item + +--- + init/Kconfig | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -134,6 +134,12 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZEN_INTERACTIVE ++ bool "Tune kernel for interactivity" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch new file mode 100644 index 0000000..c74d54f --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch @@ -0,0 +1,37 @@ +From c614dbbfd3480cf18c90fd51bb52abd53339b790 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:11:05 +0100 +Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices + +--- + block/elevator.c | 4 ++++ + init/Kconfig | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -569,7 +569,11 @@ static struct elevator_type *elevator_ge + !blk_mq_is_shared_tags(q->tag_set->flags)) + return NULL; + ++#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) ++ return elevator_find_get(q, "bfq"); ++#else + return elevator_find_get(q, "mq-deadline"); ++#endif + } + + /* +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -140,6 +140,10 @@ config ZEN_INTERACTIVE + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+ ++ --- Block Layer ---------------------------------------- ++ ++ Default scheduler for SQ..: mq-deadline -> bfq ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch new file mode 100644 index 0000000..4be7534 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch @@ -0,0 +1,36 @@ +From b87281991e8f34e557cf2fb1614b3f4808100233 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 12 Dec 2022 00:03:03 +0100 +Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices + +--- + block/elevator.c | 6 ++++++ + init/Kconfig | 1 + + 2 files changed, 7 insertions(+) + +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -567,7 +567,13 @@ static struct elevator_type *elevator_ge + + if (q->nr_hw_queues != 1 && + !blk_mq_is_shared_tags(q->tag_set->flags)) ++#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER) ++ return elevator_find_get(q, "kyber"); ++#elif defined(CONFIG_ZEN_INTERACTIVE) ++ return elevator_find_get(q, "mq-deadline"); ++#else + return NULL; ++#endif + + #if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + return elevator_find_get(q, "bfq"); +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -143,6 +143,7 @@ config ZEN_INTERACTIVE + --- Block Layer ---------------------------------------- + + Default scheduler for SQ..: mq-deadline -> bfq ++ Default scheduler for MQ..: none -> kyber + + config BROKEN + bool diff --git a/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch new file mode 100644 index 0000000..7ddc6fc --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch @@ -0,0 +1,59 @@ +From e2db8ce3c52c7bd37e93728d6c12a483f17634bc Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:21:09 +0100 +Subject: ZEN: INTERACTIVE: Enable background reclaim of hugepages + +Use [defer+madvise] as default khugepaged defrag strategy: + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". 
+ +Reasoning and details in the original patch: https://lwn.net/Articles/711248/ +--- + init/Kconfig | 4 ++++ + mm/huge_memory.c | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -145,6 +145,10 @@ config ZEN_INTERACTIVE + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber + ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Background-reclaim hugepages...: no -> yes ++ + config BROKEN + bool + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Wed, 11 Aug 2021 18:47:46 -0500 +Subject: ZEN: INTERACTIVE: Tune mgLRU to protect cache used in the last second + +Although not identical to the le9 patches that protect a byte-amount of +cache through tunables, multigenerational LRU now supports protecting +cache accessed in the last X milliseconds. + +In #218, Yu recommends starting with 1000ms and tuning as needed. This +looks like a safe default and turning on this feature should help users +that don't know they need it. +--- + init/Kconfig | 1 + + mm/vmscan.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -148,6 +148,7 @@ config ZEN_INTERACTIVE + --- Virtual Memory Subsystem --------------------------- + + Background-reclaim hugepages...: no -> yes ++ MG-LRU minimum cache TTL.......: 0 -> 1000 ms + + config BROKEN + bool +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3968,7 +3968,11 @@ static bool lruvec_is_reclaimable(struct + } + + /* to protect the working set of the last N jiffies */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned long lru_gen_min_ttl __read_mostly = HZ; ++#else + static unsigned long lru_gen_min_ttl __read_mostly; ++#endif + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { diff --git a/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch new file mode 100644 index 0000000..75a0131 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch @@ -0,0 +1,104 @@ +From 44a6d7ca11b601b34724dc41e086576499a096bd Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 31 Oct 2023 19:03:10 +0100 +Subject: ZEN: INTERACTIVE: Tune EEVDF for interactivity + +5.7: +Take "sysctl_sched_nr_migrate" tune from early XanMod builds of 128. As +of 5.7, XanMod uses 256 but that may affect applications that require +timely response to IRQs. + +5.15: +Per [a comment][1] on our ZEN INTERACTIVE commit, reducing the cost of +migration causes the system less responsive under high load. Most +likely the combination of reduced migration cost + the higher number of +tasks that can be migrated at once contributes to this. + +To better handle this situation, restore the mainline migration cost +value and also reduce the max number of tasks that can be migrated in +batch from 128 to 64. + +If this doesn't help, we'll restore the reduced migration cost and keep +total number of tasks that can be migrated at once to 32. + +[1]: https://github.com/zen-kernel/zen-kernel/commit/be5ba234ca0a5aabe74bfc7e1f636f085bd3823c#commitcomment-63159674 + +6.6: +Port the tuning to EEVDF, which removed a couple of settings. 
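
Editor's note (not part of the upstream changelog): if CONFIG_SCHED_DEBUG is enabled, the values this patch tunes can be read back from a running kernel, which helps when comparing stock and ZEN defaults. The paths are a sketch and vary between kernel versions:

    # EEVDF base slice and migration cost (nanoseconds)
    cat /sys/kernel/debug/sched/base_slice_ns
    cat /sys/kernel/debug/sched/migration_cost_ns

    # tasks moved per load-balance iteration
    cat /sys/kernel/debug/sched/nr_migrate

    # CFS bandwidth slice (microseconds)
    sysctl kernel.sched_cfs_bandwidth_slice_us
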
+ +6.7: +Instead of increasing the number of tasks that migrate at once, migrate +the amount acceptable for PREEMPT_RT, but reduce the cost so migrations +occur more often. + +This should make CFS/EEVDF behave more like out-of-tree schedulers that +aggressively use idle cores to reduce latency, but without the jank +caused by rebalancing too many tasks at once. +--- + init/Kconfig | 7 +++++++ + kernel/sched/fair.c | 13 +++++++++++++ + kernel/sched/sched.h | 2 +- + 3 files changed, 21 insertions(+), 1 deletion(-) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -150,6 +150,13 @@ config ZEN_INTERACTIVE + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + ++ --- EEVDF CPU Scheduler -------------------------------- ++ ++ Minimal granularity............: 0.75 -> 0.4 ms ++ Migration cost.................: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Task rebalancing threshold.....: 32 -> 8 ++ + config BROKEN + bool + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scalin + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++unsigned int sysctl_sched_base_slice = 400000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; ++#else + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; ++#endif + ++#ifdef CONFIG_ZEN_INTERACTIVE ++const_debug unsigned int sysctl_sched_migration_cost = 250000UL; ++#else + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cp + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2591,7 +2591,7 @@ extern void deactivate_task(struct rq *r + + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +-#ifdef CONFIG_PREEMPT_RT ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) + # define SCHED_NR_MIGRATE_BREAK 8 + #else + # define SCHED_NR_MIGRATE_BREAK 32 diff --git a/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch new file mode 100644 index 0000000..5cef79c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch @@ -0,0 +1,90 @@ +From e9c6f500f4429c32f583d6da11352b2f0bcce4c8 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Mon, 27 Jan 2020 18:27:16 +0100 +Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity + +4.10: +During some personal testing with the Dolphin emulator, MuQSS has +serious problems scaling its frequencies causing poor performance where +boosting the CPU frequencies would have fixed them. Reducing the +up_threshold to 45 with MuQSS appears to fix the issue, letting the +introduction to "Star Wars: Rogue Leader" run at 100% speed versus about +80% on my test system. 
+ +Also, lets refactor the definitions and include some indentation to help +the reader discern what the scope of all the macros are. + +5.4: +On the last custom kernel benchmark from Phoronix with Xanmod, Michael +configured all the kernels to run using ondemand instead of the kernel's +[default selection][1]. This reminded me that another option outside of +the kernels control is the user's choice to change the cpufreq governor, +for better or for worse. + +In Liquorix, performance is the default governor whether you're running +acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or +LMT to control the power balance on their system, especially when +they're plugged in or on battery. However, it's pretty clear to me a +lot of people would choose ondemand over performance since it's not +obvious it has huge performance ramifications with MuQSS, and ondemand +otherwise is "good enough" for most people. + +Lets codify lower up thresholds for MuQSS to more closely synergize with +its aggressive thread migration behavior. This way when ondemand is +configured, you get sort of a "performance-lite" type of result but with +the power savings you expect when leaving the running system idle. + +[1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel + +5.14: +Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot +more of mainline scheduling and do a good job of pinning single threaded +tasks to their respective core, there's still applications that +confusingly run steady near 50% and benefit from going full speed or +turbo when they need to run (emulators for more recent consoles come to +mind). + +Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. + +5.15: +Remove MuQSS cpufreq configuration. 
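
Editor's note (not part of the patch): the macros patched here surface as ondemand's global sysfs tunables, so the effect can be approximated on an unpatched kernel at runtime. Illustrative only, and the path assumes global rather than per-policy governor tunables:

    # with the ondemand governor selected
    cd /sys/devices/system/cpu/cpufreq/ondemand
    cat up_threshold sampling_down_factor

    # mimic the ZEN defaults from this patch
    echo 55 > up_threshold
    echo 5 > sampling_down_factor
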
+--- + drivers/cpufreq/cpufreq_ondemand.c | 8 +++++++- + init/Kconfig | 6 ++++++ + 2 files changed, 13 insertions(+), 1 deletion(-) + +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,10 +18,16 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ ++#if defined(CONFIG_ZEN_INTERACTIVE) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define MICRO_FREQUENCY_UP_THRESHOLD (60) ++#define DEF_SAMPLING_DOWN_FACTOR (5) ++#else + #define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define DEF_SAMPLING_DOWN_FACTOR (100) ++#endif + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (70) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -157,6 +157,12 @@ config ZEN_INTERACTIVE + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + ++ --- CPUFreq Settings ----------------------------------- ++ ++ Ondemand sampling down factor..: 100 -> 5 ++ Ondemand default up threshold..: 63 -> 55 ++ Ondemand micro up threshold....: 70 -> 60 ++ + config BROKEN + bool + diff --git a/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch new file mode 100644 index 0000000..e6a0e18 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch @@ -0,0 +1,33 @@ +From 4706a3fb5823c97dc6acc1e86958b71e2c048ec5 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sat, 5 Mar 2022 11:37:14 -0600 +Subject: ZEN: INTERACTIVE: mm: Disable unevictable compaction + +This option is already disabled when CONFIG_PREEMPT_RT is enabled, lets +turn it off when CONFIG_ZEN_INTERACTIVE is set as well. +--- + init/Kconfig | 1 + + mm/Kconfig | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -149,6 +149,7 @@ config ZEN_INTERACTIVE + + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms ++ Compact unevictable............: yes -> no + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -649,7 +649,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || ZEN_INTERACTIVE + default 1 + + # diff --git a/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch new file mode 100644 index 0000000..2927530 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch @@ -0,0 +1,38 @@ +From 8146f220f871c4db77c8363c831784041a5bcf7b Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sat, 24 Oct 2020 22:17:49 -0700 +Subject: ZEN: INTERACTIVE: mm: Disable proactive compaction by default + +On-demand compaction works fine assuming that you don't have a need to +spam the page allocator nonstop for large order page allocations. 
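
Editor's note (not part of the patch): the default being zeroed here is also exposed as a sysctl, so the same behaviour can be tried without rebuilding. A minimal sketch:

    # mainline default is 20; 0 disables proactive compaction entirely
    sysctl vm.compaction_proactiveness
    sysctl -w vm.compaction_proactiveness=0
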
+ +Signed-off-by: Sultan Alsawaf +--- + init/Kconfig | 1 + + mm/compaction.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -150,6 +150,7 @@ config ZEN_INTERACTIVE + Background-reclaim hugepages...: no -> yes + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no ++ Compaction proactiveness.......: 20 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1950,7 +1950,11 @@ static int sysctl_compact_unevictable_al + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++static unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + static int sysctl_extfrag_threshold = 500; + static int __read_mostly sysctl_compact_memory; + diff --git a/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch new file mode 100644 index 0000000..e745c99 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch @@ -0,0 +1,57 @@ +From 5f16843397798d2c709e3b8af4b1a73539d13aa8 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Sat, 28 Mar 2020 13:06:28 -0700 +Subject: ZEN: INTERACTIVE: mm: Disable watermark boosting by default + +What watermark boosting does is preemptively fire up kswapd to free +memory when there hasn't been an allocation failure. It does this by +increasing kswapd's high watermark goal and then firing up kswapd. The +reason why this causes freezes is because, with the increased high +watermark goal, kswapd will steal memory from processes that need it in +order to make forward progress. These processes will, in turn, try to +allocate memory again, which will cause kswapd to steal necessary pages +from those processes again, in a positive feedback loop known as page +thrashing. When page thrashing occurs, your system is essentially +livelocked until the necessary forward progress can be made to stop +processes from trying to continuously allocate memory and trigger +kswapd to steal it back. + +This problem already occurs with kswapd *without* watermark boosting, +but it's usually only encountered on machines with a small amount of +memory and/or a slow CPU. Watermark boosting just makes the existing +problem worse enough to notice on higher spec'd machines. + +Disable watermark boosting by default since it's a total dumpster fire. +I can't imagine why anyone would want to explicitly enable it, but the +option is there in case someone does. 
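+
+The factor is also exposed as a sysctl, where 15000 corresponds to the
+"1.5" shown in the Kconfig summary (the value is expressed in units of
+1/10000 of the high watermark). A minimal sketch of checking or
+disabling it on a stock kernel:
+
+    sysctl vm.watermark_boost_factor        # 15000 upstream, 0 here
+    sysctl -w vm.watermark_boost_factor=0   # same effect at run time (root)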
+ +Signed-off-by: Sultan Alsawaf +--- + init/Kconfig | 1 + + mm/page_alloc.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -151,6 +151,7 @@ config ZEN_INTERACTIVE + MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 ++ Watermark boost factor.........: 1.5 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -271,7 +271,11 @@ const char * const migratetype_names[MIG + + int min_free_kbytes = 1024; + int user_min_free_kbytes = -1; ++#ifdef CONFIG_ZEN_INTERACTIVE ++static int watermark_boost_factor __read_mostly; ++#else + static int watermark_boost_factor __read_mostly = 15000; ++#endif + static int watermark_scale_factor = 10; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ diff --git a/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch new file mode 100644 index 0000000..0333d12 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch @@ -0,0 +1,57 @@ +From eb51c53e5ded1743830368815c550b871f950738 Mon Sep 17 00:00:00 2001 +From: Sultan Alsawaf +Date: Wed, 20 Oct 2021 20:50:11 -0700 +Subject: ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to + reduce scheduling delays + +The page allocator processes free pages in groups of pageblocks, where +the size of a pageblock is typically quite large (1024 pages without +hugetlbpage support). Pageblocks are processed atomically with the zone +lock held, which can cause severe scheduling delays on both the CPU +going through the pageblock and any other CPUs waiting to acquire the +zone lock. A frequent offender is move_freepages_block(), which is used +by rmqueue() for page allocation. + +As it turns out, there's no requirement for pageblocks to be so large, +so the pageblock order can simply be reduced to ease the scheduling +delays and zone lock contention. PAGE_ALLOC_COSTLY_ORDER is used as a +reasonable setting to ensure non-costly page allocation requests can +still be serviced without always needing to free up more than one +pageblock's worth of pages at a time. + +This has a noticeable effect on overall system latency when memory +pressure is elevated. The various mm functions which operate on +pageblocks no longer appear in the preemptoff tracer, where previously +they would spend up to 100 ms on a mobile arm64 CPU processing a +pageblock with preemption disabled and the zone lock held. 
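+
+Which order a given build actually ended up with can be read straight
+from /proc/pagetypeinfo (root-only on recent kernels). Note that on
+configurations with hugepage support the order comes from the hugepage
+path instead, so 3 only shows up for the non-hugetlbpage case patched
+here:
+
+    # The first two lines report "Page block order" and "Pages per block"
+    head -n 2 /proc/pagetypeinfo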
+ +Signed-off-by: Sultan Alsawaf +--- + include/linux/pageblock-flags.h | 4 ++++ + init/Kconfig | 1 + + 2 files changed, 5 insertions(+) + +--- a/include/linux/pageblock-flags.h ++++ b/include/linux/pageblock-flags.h +@@ -52,7 +52,11 @@ extern unsigned int pageblock_order; + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ ++#ifdef CONFIG_ZEN_INTERACTIVE ++#define pageblock_order PAGE_ALLOC_COSTLY_ORDER ++#else + #define pageblock_order MAX_PAGE_ORDER ++#endif + + #endif /* CONFIG_HUGETLB_PAGE */ + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -152,6 +152,7 @@ config ZEN_INTERACTIVE + Compact unevictable............: yes -> no + Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 ++ Pageblock order................: 10 -> 3 + + --- EEVDF CPU Scheduler -------------------------------- + diff --git a/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch new file mode 100644 index 0000000..2ead0f7 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch @@ -0,0 +1,44 @@ +From a8a0d4b9f356610babe5b884500799310fe6dcdd Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sat, 21 May 2022 15:15:09 -0500 +Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops + +Queueing in dm-crypt for crypto operations reduces performance on modern +systems. As discussed in an article from Cloudflare, they discovered +that queuing was introduced because the crypto subsystem used to be +synchronous. Since it's now asynchronous, we get double queueing when +using the subsystem through dm-crypt. This is obviously undesirable and +reduces throughput and increases latency. + +Disable queueing when using our Zen Interactive configuration. + +Fixes: https://github.com/zen-kernel/zen-kernel/issues/282 +--- + drivers/md/dm-crypt.c | 5 +++++ + init/Kconfig | 1 + + 2 files changed, 6 insertions(+) + +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3310,6 +3310,11 @@ static int crypt_ctr(struct dm_target *t + goto bad; + } + ++#ifdef CONFIG_ZEN_INTERACTIVE ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -144,6 +144,7 @@ config ZEN_INTERACTIVE + + Default scheduler for SQ..: mq-deadline -> bfq + Default scheduler for MQ..: none -> kyber ++ DM-Crypt workqueues.......: yes -> no + + --- Virtual Memory Subsystem --------------------------- + diff --git a/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch new file mode 100644 index 0000000..35de516 --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch @@ -0,0 +1,49 @@ +From 5a8fabcd4e7396500f2c0070f8b7ce9106eb9bfa Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Mon, 5 Sep 2022 11:35:20 -0500 +Subject: ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead + +Per an [issue][1] on the chromium project, swap-in readahead causes more +jank than not. 
This might be caused by poor optimization on the +swapping code, or the fact under memory pressure, we're pulling in pages +we don't need, causing more swapping. + +Either way, this is mainline/upstream to Chromium, and ChromeOS +developers care a lot about system responsiveness. Lets implement the +same change so Zen Kernel users benefit. + +[1]: https://bugs.chromium.org/p/chromium/issues/detail?id=263561 +--- + init/Kconfig | 1 + + mm/swap.c | 5 +++++ + 2 files changed, 6 insertions(+) + +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -154,6 +154,7 @@ config ZEN_INTERACTIVE + Compaction proactiveness.......: 20 -> 0 + Watermark boost factor.........: 1.5 -> 0 + Pageblock order................: 10 -> 3 ++ Swap-in readahead..............: 3 -> 0 + + --- EEVDF CPU Scheduler -------------------------------- + +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -1126,6 +1126,10 @@ void folio_batch_remove_exceptionals(str + */ + void __init swap_setup(void) + { ++#ifdef CONFIG_ZEN_INTERACTIVE ++ /* Only swap-in pages requested, avoid readahead */ ++ page_cluster = 0; ++#else + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1137,4 +1141,5 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++#endif + } diff --git a/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch b/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch new file mode 100644 index 0000000..d89a19c --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch @@ -0,0 +1,19 @@ +From 238d23d6cb3f8610aa1cd3bdaeb398c63a4c9cb2 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Tue, 1 Oct 2024 02:22:46 +0200 +Subject: ZEN: Update VHBA driver + +remote https://github.com/cdemu/cdemu +tag vhba-module-20240917 +--- + drivers/scsi/vhba/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/scsi/vhba/Makefile ++++ b/drivers/scsi/vhba/Makefile +@@ -1,4 +1,4 @@ +-VHBA_VERSION := 20240202 ++VHBA_VERSION := 20240917 + + obj-$(CONFIG_VHBA) += vhba.o + ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror diff --git a/debian/patches/series b/debian/patches/series index 074ab11..1045ec3 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -108,3 +108,212 @@ bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch bugfix/all/kbuild-bpf-fix-btf-reproducibility.patch # ABI maintenance + +## custom patches + +krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch +krd/0002-established-timeout.patch +krd/0003-local-ports.patch +krd/0004-bridge-group_fwd_mask.patch + +## 3rd party patches + +mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch +mixed-arch/0002-ZEN-Fixup-graysky-s-more-ISA-levels-and-uarches.patch +mixed-arch/0003-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch +mixed-arch/0004-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch +mixed-arch/0005-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch +mixed-arch/0006-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch + +misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch +misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch +misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch +misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch 
+misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch +misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch +misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch +misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch +misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch +misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch +misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch +misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch +misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch +misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch +misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch +misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch +misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch +misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch +misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch + +misc-ntsync5/0001-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch +misc-ntsync5/0002-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch +misc-ntsync5/0003-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch +misc-ntsync5/0004-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch +misc-ntsync5/0005-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch +misc-ntsync5/0006-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch +misc-ntsync5/0007-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch +misc-ntsync5/0008-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch +misc-ntsync5/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch +misc-ntsync5/0010-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch +misc-ntsync5/0011-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch +misc-ntsync5/0012-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch +misc-ntsync5/0013-ntsync-Introduce-alertable-waits.patch +misc-ntsync5/0014-maintainers-Add-an-entry-for-ntsync.patch +misc-ntsync5/0015-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch +misc-ntsync5/0016-Revert-misc-ntsync-mark-driver-as-broken-to-prevent-.patch + +misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch +misc-openwrt/0002-net-enable-fraglist-GRO-by-default.patch +misc-openwrt/0003-net-remove-NETIF_F_GSO_FRAGLIST-from-NETIF_F_GSO_SOF.patch + +patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-add-quirk-for-Ryzen-3000-series-p.patch +patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Export-symbols-for-changing-modes.patch +patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-ut-Add-test-case-for-mode-switche.patch +patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Catch-failures-for-amd_pstate_epp.patch +patchset-pf/amd-pstate/0005-x86-amd-Move-amd_get_highest_perf-from-amd.c-to-cppc.patch +patchset-pf/amd-pstate/0006-ACPI-CPPC-Adjust-return-code-for-inline-functions-in.patch +patchset-pf/amd-pstate/0007-x86-amd-Rename-amd_get_highest_perf-to-amd_get_boost.patch +patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch +patchset-pf/amd-pstate/0009-ACPI-CPPC-Adjust-debug-messages-in-amd_set_max_freq_.patch +patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch +patchset-pf/amd-pstate/0011-x86-amd-Detect-preferred-cores-in-amd_get_boost_rati.patch +patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Merge-amd_pstate_highest_perf_set.patch +patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Optimize-amd_pstate_update_limits.patch 
+patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Add-documentation-for-amd_pstate_.patch +patchset-pf/amd-pstate/0015-amd-pstate-Add-missing-documentation-for-amd_pstate_.patch +patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch +patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Fix-an-Uninitialized-variables.patch +patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-Rename-MSR-and-shared-memory-spec.patch +patchset-pf/amd-pstate/0019-cpufreq-Add-a-callback-to-update-the-min_freq_req-fr.patch +patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-Set-the-initial-min_freq-to-lowes.patch +patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-Cleanup-the-old-min_freq-qos-requ.patch +patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Fix-amd_pstate-mode-switch-on-sha.patch +patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Use-nominal-perf-for-limits-when-.patch +patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Don-t-update-CPPC-request-in-amd_.patch +patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Use-amd_pstate_update_min_max_lim.patch +patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-needless-EPP-initialization.patch +patchset-pf/amd-pstate/0027-amd-pstate-6.11-update-setting-the-minimum-frequency.patch +patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Call-amd_pstate_register-in-amd_p.patch +patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Call-amd_pstate_set_driver-in-amd.patch +patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-Remove-the-switch-case-in-amd_pst.patch +patchset-pf/amd-pstate/0031-cpufreq-amd-pstate-Remove-the-redundant-amd_pstate_s.patch +patchset-pf/amd-pstate/0032-cpufreq-amd-pstate-ut-Add-fix-for-min-freq-unit-test.patch +patchset-pf/amd-pstate/0033-amd-pstate-Set-min_perf-to-nominal_perf-for-active-m.patch +patchset-pf/amd-pstate/0034-amd-pstate-Switch-to-amd-pstate-by-default-on-some-S.patch +patchset-pf/amd-pstate/0035-cpufreq-amd-pstate-Push-adjust_perf-vfunc-init-into-.patch +patchset-pf/amd-pstate/0036-cpufreq-amd-pstate-Move-registration-after-static-fu.patch + +patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch +patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch +patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch +patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch +patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch +patchset-pf/amd-rapl/0006-perf-x86-rapl-Move-the-pmu-allocation-out-of-CPU-hot.patch +patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch +patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch +patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch +patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch +patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch +patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch +patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch +patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch +patchset-pf/amd-rapl/0015-perf-x86-rapl-Remove-the-global-variable-rapl_msrs.patch +patchset-pf/amd-rapl/0016-perf-x86-rapl-Move-the-cntr_mask-to-rapl_pmus-struct.patch +patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch + +patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch +patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch 
+patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch +patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch +patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch +patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch +patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch +patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch + +patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch +patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch +patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch + +patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch +patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch + +patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch +patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch + +patchset-xanmod/binder/0001-binder-turn-into-module.patch + +patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch +patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch +patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch + +patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch +patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch + +patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch + +patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch + +patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch +patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch +patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch +patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch +patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch +patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch + +patchset-xanmod/xanmod/0001-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch +patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch +patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch +patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch +patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch +patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch +patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch +patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch +patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch +patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch +patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch +patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch +patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch +patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch +patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch +patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch + 
+patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch +patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch +patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch +patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch +patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch +patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch +patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch +patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch +patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch +patchset-zen/sauce/0010-ZEN-Set-default-max-map-count-to-INT_MAX-5.patch +patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch +patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch +patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch +patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch +patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-mgLRU-to-protect-cache-used-in-.patch +patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch +patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch +patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch +patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-proactive-compaction-by-d.patch +patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch +patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch +patchset-zen/sauce/0022-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch +patchset-zen/sauce/0023-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch +patchset-zen/sauce/0024-ZEN-Update-VHBA-driver.patch + +patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch +patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch +patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch +patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch +patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch +patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch +patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch +patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch +patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch +patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch +patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch +patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch + +patchset-zen/fixes/0001-Partially-revert-drm-amd-amdgpu-add-pipe1-hardware-s.patch +patchset-zen/fixes/0002-drm-amdgpu-swsmu-default-to-fullscreen-3D-profile-fo.patch
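
One way to sanity-check that the extended series still applies cleanly is
to drive it with quilt from the top of an unpacked source tree. This is
just a convenient smoke test, not something the packaging itself requires;
quilt ignores the comment lines such as "## custom patches":

    QUILT_PATCHES=debian/patches quilt push -a   # apply everything in order
    QUILT_PATCHES=debian/patches quilt pop -a    # and unwind again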