
add 3rd party/custom patches

3rd-party patches (in alphabetical order):
- bbr3
- ntsync5
- openwrt
- pf-kernel
- xanmod
- zen

no configuration changes for now

commit 8cbaf1dea2 (parent 8082dfeaca)
2024-10-29 05:12:06 +03:00
186 changed files with 43626 additions and 0 deletions


@@ -0,0 +1,52 @@
From ce1cd7869a208112a8728d1fe9e373f78a2e4a6e Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Tue, 11 Jun 2019 12:26:55 -0400
Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection
This commit is a bug fix for the Linux TCP app-limited
(application-limited) logic that is used for collecting rate
(bandwidth) samples.
Previously the app-limited logic only looked for "bubbles" of
silence in between application writes, by checking at the start
of each sendmsg. But "bubbles" of silence can also happen before
retransmits: e.g. bubbles can happen between an application write
and a retransmit, or between two retransmits.
Retransmits are triggered by ACKs or timers. So this commit checks
for bubbles of app-limited silence upon ACKs or timers.
Why does this commit check for app-limited state at the start of
ACKs and timer handling? Because at that point we know whether
inflight was fully using the cwnd. During processing the ACK or
timer event we often change the cwnd; after changing the cwnd we
can't know whether inflight was fully using the old cwnd.
Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc
Change-Id: I37221506f5166877c2b110753d39bb0757985e68
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_timer.c | 1 +
2 files changed, 2 insertions(+)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3961,6 +3961,7 @@ static int tcp_ack(struct sock *sk, cons
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
+ tcp_rate_check_app_limited(sk);
/* ts_recent update must be made after we are sure that the packet
* is in window.
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock
return;
}
+ tcp_rate_check_app_limited(sk);
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;


@@ -0,0 +1,74 @@
From b32715fbe2ab96d1060ec37bb9c03feedf366494 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 24 Jun 2018 21:55:59 -0400
Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp,
first_tx_mstamp to u32 to free up 8 bytes
Free up some space for tracking inflight and losses for each
bw sample, in upcoming commits.
These timestamps are in microseconds, and are now stored in 32
bits. So they can only hold time intervals up to roughly 2^12 = 4096
seconds. But Linux TCP RTT and RTO tracking has the same 32-bit
microsecond implementation approach and resulting deployment
limitations. So this is not introducing a new limit. And these should
not be a limitation for the foreseeable future.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55
Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 9 +++++++--
net/ipv4/tcp_rate.c | 7 ++++---
2 files changed, 11 insertions(+), 5 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -884,6 +884,11 @@ static inline u32 tcp_stamp_us_delta(u64
return max_t(s64, t1 - t0, 0);
}
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+ return max_t(s32, t1 - t0, 0);
+}
+
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
@@ -973,9 +978,9 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- u64 first_tx_mstamp;
+ u32 first_tx_mstamp;
/* when we reached the "delivered" count */
- u64 delivered_mstamp;
+ u32 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp32_us_delta(
+ tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
}
/* Mark off the skb delivered once it's sacked to avoid being
@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
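
As a standalone illustration (not part of the patch series), the wrap-safe arithmetic described above can be exercised in plain userspace C; the helper mirrors tcp_stamp32_us_delta(), everything else is an assumption for the example:

/* Illustration only: 32-bit microsecond deltas stay correct across
 * wraparound because the unsigned subtraction is reinterpreted as s32
 * and clamped at 0, as tcp_stamp32_us_delta() does above.
 * 2^32 us is about 4295 s, i.e. roughly the 2^12 s mentioned above.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
    int32_t d = (int32_t)(t1 - t0);

    return d > 0 ? (uint32_t)d : 0;
}

int main(void)
{
    uint32_t t0 = 0xFFFFFF00u;  /* just before the 32-bit wrap */
    uint32_t t1 = 0x00000100u;  /* just after the wrap */

    printf("delta across wrap: %u us\n", stamp32_us_delta(t1, t0));    /* 512 */
    printf("reversed order clamps to: %u\n", stamp32_us_delta(t0, t1)); /* 0 */
    return 0;
}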


@@ -0,0 +1,109 @@
From 25856231832186fe13189b986cc0e91860c18201 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 5 Aug 2017 11:49:50 -0400
Subject: [PATCH 03/19] net-tcp_bbr: v2: snapshot packets in flight at transmit
time and pass in rate_sample
CC algorithms may want to snapshot the number of packets in flight at
transmit time and pass in rate_sample, to understand the relationship
between inflight and losses or ECN signals, to try to find the highest
inflight value that has acceptable levels of loss/ECN marking.
We split out the code to set an skb's tx.in_flight field into its own
function, so that this code can be used for the TCP_REPAIR "fake send"
code path that inserts skbs into the rtx queue without sending them.
Effort: net-tcp_bbr
Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea
Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b
Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63
Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 6 ++++++
net/ipv4/tcp_output.c | 1 +
net/ipv4/tcp_rate.c | 20 ++++++++++++++++++++
3 files changed, 27 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -981,6 +981,10 @@ struct tcp_skb_cb {
u32 first_tx_mstamp;
/* when we reached the "delivered" count */
u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+ u32 in_flight:20, /* packets in flight at transmit */
+ unused2:12;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -1136,6 +1140,7 @@ struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+ u32 tx_in_flight; /* packets in flight at starting timestamp */
s32 delivered; /* number of packets delivered over interval */
s32 delivered_ce; /* number of packets delivered w/ CE marks*/
long interval_us; /* time for tp->delivered to incr "delivered" */
@@ -1258,6 +1263,7 @@ static inline void tcp_ca_event(struct s
void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
/* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs);
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2765,6 +2765,7 @@ static bool tcp_write_xmit(struct sock *
skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
+ tcp_set_tx_in_flight(sk, skb);
goto repair; /* Skip network transmission */
}
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
* ready to send in the write queue.
*/
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 in_flight;
+
+ /* Check, sanitize, and record packets in flight after skb was sent. */
+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+ "insane in_flight %u cc %s mss %u "
+ "cwnd %u pif %u %u %u %u\n",
+ in_flight, inet_csk(sk)->icsk_ca_ops->name,
+ tp->mss_cache, tp->snd_cwnd,
+ tp->packets_out, tp->retrans_out,
+ tp->sacked_out, tp->lost_out))
+ in_flight = TCPCB_IN_FLIGHT_MAX;
+ TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
/* Snapshot the current delivery information in the skb, to generate
* a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
*/
@@ -67,6 +85,7 @@ void tcp_rate_skb_sent(struct sock *sk,
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+ tcp_set_tx_in_flight(sk, skb);
}
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -96,6 +115,7 @@ void tcp_rate_skb_delivered(struct sock
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->tx_in_flight = scb->tx.in_flight;
rs->last_end_seq = scb->end_seq;
/* Record send time of most recently ACKed packet: */


@@ -0,0 +1,70 @@
From b1772710e8b5b98c09e96d4f1af620cd938fddf7 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Thu, 12 Oct 2017 23:44:27 -0400
Subject: [PATCH 04/19] net-tcp_bbr: v2: count packets lost over TCP rate
sampling interval
For understanding the relationship between inflight and packet loss
signals, to try to find the highest inflight value that has acceptable
levels of packet losses.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab
Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 5 ++++-
net/ipv4/tcp_rate.c | 3 +++
2 files changed, 7 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -985,6 +985,7 @@ struct tcp_skb_cb {
#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
u32 in_flight:20, /* packets in flight at transmit */
unused2:12;
+ u32 lost; /* packets lost so far upon tx of skb */
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -1138,11 +1139,13 @@ struct ack_sample {
*/
struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
+ u32 prior_lost; /* tp->lost at "prior_mstamp" */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
u32 tx_in_flight; /* packets in flight at starting timestamp */
+ s32 lost; /* number of packets lost over interval */
s32 delivered; /* number of packets delivered over interval */
- s32 delivered_ce; /* number of packets delivered w/ CE marks*/
+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
u32 snd_interval_us; /* snd interval for delivered packets */
u32 rcv_interval_us; /* rcv interval for delivered packets */
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -84,6 +84,7 @@ void tcp_rate_skb_sent(struct sock *sk,
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.lost = tp->lost;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
tcp_set_tx_in_flight(sk, skb);
}
@@ -110,6 +111,7 @@ void tcp_rate_skb_delivered(struct sock
if (!rs->prior_delivered ||
tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
scb->end_seq, rs->last_end_seq)) {
+ rs->prior_lost = scb->tx.lost;
rs->prior_delivered_ce = scb->tx.delivered_ce;
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
@@ -165,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
return;
}
rs->delivered = tp->delivered - rs->prior_delivered;
+ rs->lost = tp->lost - rs->prior_lost;
rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
/* delivered_ce occupies less than 32 bits in the skb control block */


@@ -0,0 +1,38 @@
From fdf01142aea8645186e080f1278d3b5a5fd8c66c Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 19 Nov 2018 13:48:36 -0500
Subject: [PATCH 05/19] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece
For understanding the relationship between inflight and ECN signals,
to try to find the highest inflight value that has acceptable levels
of ECN marking.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8
Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 1 +
2 files changed, 2 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1157,6 +1157,7 @@ struct rate_sample {
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
+ bool is_ece; /* did this ACK have ECN marked? */
};
struct tcp_congestion_ops {
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4060,6 +4060,7 @@ static int tcp_ack(struct sock *sk, cons
delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+ rs.is_ece = !!(flag & FLAG_ECE);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);


@@ -0,0 +1,57 @@
From a3e88432c2ebf12de9c2053a13417ddf2ad4cb4e Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Tue, 7 Aug 2018 21:52:06 -0400
Subject: [PATCH 06/19] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC
module callback API
For connections experiencing reordering, RACK can mark packets lost
long after we receive the SACKs/ACKs hinting that the packets were
actually lost.
This means that CC modules cannot easily learn the volume of inflight
data at which packet loss happens by looking at the current inflight
or even the packets in flight when the most recently SACKed packet was
sent. To learn this, CC modules need to know how many packets were in
flight at the time lost packets were sent. This new callback, combined
with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this.
This also provides a consistent callback that is invoked whether
packets are marked lost upon ACK processing, using the RACK reordering
timer, or at RTO time.
Effort: net-tcp_bbr
Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4
Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 3 +++
net/ipv4/tcp_input.c | 5 +++++
2 files changed, 8 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1184,6 +1184,9 @@ struct tcp_congestion_ops {
/* override sysctl_tcp_min_tso_segs */
u32 (*min_tso_segs)(struct sock *sk);
+ /* react to a specific lost skb (optional) */
+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
+
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(s
*/
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
+ struct sock *sk = (struct sock *)tp;
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
tp->lost += tcp_skb_pcount(skb);
+ if (ca_ops->skb_marked_lost)
+ ca_ops->skb_marked_lost(sk, skb);
}
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
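
For context, a minimal sketch (not from the patch series) of how a hypothetical out-of-tree CC module could wire up the new hook; the module name "demo_ca", its private state and the lost_inflight_hi bookkeeping are assumptions for illustration:

/* Illustrative sketch only: a toy CC module using the new optional
 * skb_marked_lost() hook to remember the largest "inflight at send
 * time" seen among lost skbs (tx.in_flight comes from patch 03/19).
 */
#include <linux/module.h>
#include <net/tcp.h>

struct demo_ca {
    u32 lost_inflight_hi;   /* max inflight-at-send among lost skbs */
};

static void demo_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
{
    struct demo_ca *ca = inet_csk_ca(sk);

    ca->lost_inflight_hi = max(ca->lost_inflight_hi,
                               (u32)TCP_SKB_CB(skb)->tx.in_flight);
}

static struct tcp_congestion_ops demo_ca_ops __read_mostly = {
    .name            = "demo_ca",
    .owner           = THIS_MODULE,
    .ssthresh        = tcp_reno_ssthresh,
    .cong_avoid      = tcp_reno_cong_avoid,
    .undo_cwnd       = tcp_reno_undo_cwnd,
    .skb_marked_lost = demo_skb_marked_lost,   /* new optional hook */
};

static int __init demo_ca_register(void)
{
    BUILD_BUG_ON(sizeof(struct demo_ca) > ICSK_CA_PRIV_SIZE);
    return tcp_register_congestion_control(&demo_ca_ops);
}

static void __exit demo_ca_unregister(void)
{
    tcp_unregister_congestion_control(&demo_ca_ops);
}

module_init(demo_ca_register);
module_exit(demo_ca_unregister);
MODULE_LICENSE("GPL");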


@@ -0,0 +1,59 @@
From af7d33e71649b8e2ae00dccf336720a8ab891606 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 1 May 2019 20:16:33 -0400
Subject: [PATCH 07/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in
tcp_shifted_skb()
When tcp_shifted_skb() updates state as adjacent SACKed skbs are
coalesced, previously the tx.in_flight was not adjusted, so we could
get contradictory state where the skb's recorded pcount was bigger
than the tx.in_flight (the number of segments that were in_flight
after sending the skb).
Normally, having a SACKed skb with contradictory pcount/tx.in_flight
would not matter. However, with SACK reneging, the SACKed bit is
removed, and an skb once again becomes eligible for retransmitting,
fragmenting, SACKing, etc. Packetdrill testing verified the following
sequence is possible in a kernel that does not have this commit:
- skb N is SACKed
- skb N+1 is SACKed and combined with skb N using tcp_shifted_skb()
- tcp_shifted_skb() will increase the pcount of prev,
but leave tx.in_flight as-is
- so prev skb can have pcount > tx.in_flight
- RTO, tcp_timeout_mark_lost(), detect reneg,
remove "SACKed" bit, mark skb N as lost
- find pcount of skb N is greater than its tx.in_flight
I suspect this issue is what caused the bbr2_inflight_hi_from_lost_skb():
WARN_ON_ONCE(inflight_prev < 0)
to fire in production machines using bbr2.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715
Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/tcp_input.c | 11 +++++++++++
1 file changed, 11 insertions(+)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1506,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock
WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+ "prev in_flight: %u skb in_flight: %u pcount: %u",
+ TCP_SKB_CB(prev)->tx.in_flight,
+ TCP_SKB_CB(skb)->tx.in_flight,
+ pcount))
+ TCP_SKB_CB(skb)->tx.in_flight = 0;
+ else
+ TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+ TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
/* When we're adding to gso_segs == 1, gso_size will be zero,
* in theory this shouldn't be necessary but as long as DSACK
* code can come after this skb later on it's better to keep


@@ -0,0 +1,97 @@
From a4d44bce49f61f8755f558dc40edff5f8958b7c6 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 1 May 2019 20:16:25 -0400
Subject: [PATCH 08/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in
tcp_fragment()
When we fragment an skb that has already been sent, we need to update
the tx.in_flight for the first skb in the resulting pair ("buff").
Because we were not updating the tx.in_flight, the tx.in_flight value
was inconsistent with the pcount of the "buff" skb (tx.in_flight would
be too high). That meant that if the "buff" skb was lost, then
bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value
that is too high. This could result in longer queues and higher packet
loss.
Packetdrill testing verified that without this commit, when the second
half of an skb is SACKed and then later the first half of that skb is
marked lost, the calculated inflight_hi was incorrect.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874
Origin-9xx-SHA1: a0eb099690af net-tcp_bbr: v2: fix tcp_fragment() tx.in_flight recomputation [prod feb 8 2021; use as a fixup]
Origin-9xx-SHA1: 885503228153ff0c9114e net-tcp_bbr: v2: introduce tcp_skb_tx_in_flight_is_suspicious() helper for warnings
Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 15 +++++++++++++++
net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++-
2 files changed, 40 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1283,6 +1283,21 @@ static inline bool tcp_skb_sent_after(u6
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
+/* If a retransmit failed due to local qdisc congestion or other local issues,
+ * then we may have called tcp_set_skb_tso_segs() to increase the number of
+ * segments in the skb without increasing the tx.in_flight. In all other cases,
+ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We
+ * do not have the state to know whether a retransmit failed due to local qdisc
+ * congestion or other local issues, so to avoid spurious warnings we consider
+ * that any skb marked lost may have suffered that fate.
+ */
+static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount,
+ u32 skb_sacked_flags,
+ u32 tx_in_flight)
+{
+ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST);
+}
+
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1601,7 +1601,7 @@ int tcp_fragment(struct sock *sk, enum t
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int old_factor;
+ int old_factor, inflight_prev;
long limit;
int nlen;
u8 flags;
@@ -1676,6 +1676,30 @@ int tcp_fragment(struct sock *sk, enum t
if (diff)
tcp_adjust_pcount(sk, skb, diff);
+
+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+ if (inflight_prev < 0) {
+ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+ old_factor,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->tx.in_flight),
+ "inconsistent: tx.in_flight: %u "
+ "old_factor: %d mss: %u sacked: %u "
+ "1st pcount: %d 2nd pcount: %d "
+ "1st len: %u 2nd len: %u ",
+ TCP_SKB_CB(skb)->tx.in_flight, old_factor,
+ mss_now, TCP_SKB_CB(skb)->sacked,
+ tcp_skb_pcount(skb), tcp_skb_pcount(buff),
+ skb->len, buff->len);
+ inflight_prev = 0;
+ }
+ /* Set 1st tx.in_flight as if 1st were sent by itself: */
+ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev +
+ tcp_skb_pcount(skb);
+ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */
+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+ tcp_skb_pcount(skb) +
+ tcp_skb_pcount(buff);
}
/* Link BUFF into the send queue. */
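
A worked numeric example (illustration only, not from the patch) of the tx.in_flight recomputation above: an skb that was sent with tx.in_flight = 10 and old_factor (pcount before the split) = 4 is fragmented into halves of 3 and 1 segments.

#include <stdio.h>

int main(void)
{
    unsigned int tx_in_flight = 10;  /* in_flight recorded when the skb was sent */
    unsigned int old_factor = 4;     /* pcount of the skb before the split */
    unsigned int pcount_skb = 3;     /* 1st half after tcp_fragment() */
    unsigned int pcount_buff = 1;    /* 2nd half after tcp_fragment() */
    int inflight_prev = (int)tx_in_flight - (int)old_factor;   /* 6 */

    if (inflight_prev < 0)   /* suspicious state: clamp, as the patch warns */
        inflight_prev = 0;

    /* 1st half as if it were sent by itself: 6 + 3 = 9 */
    printf("1st tx.in_flight: %d\n", inflight_prev + (int)pcount_skb);
    /* 2nd half keeps the original total: 6 + 3 + 1 = 10 */
    printf("2nd tx.in_flight: %d\n",
           inflight_prev + (int)pcount_skb + (int)pcount_buff);
    return 0;
}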


@@ -0,0 +1,73 @@
From 65cca0e8fd954a150ec874650af47f7800ea3049 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Wed, 23 May 2018 17:55:54 -0700
Subject: [PATCH 09/19] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS
Add a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a
congestion control module to receive CE events.
Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN
bit in opts flag to receive CE events but this may incur changes in ECN
behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS
that allows congestion control modules to receive CE events
independently of TCP_CONG_NEEDS_ECN.
Effort: net-tcp
Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b
Change-Id: I2255506985242f376d910c6fd37daabaf4744f24
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 14 +++++++++++++-
net/ipv4/tcp_input.c | 4 ++--
2 files changed, 15 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1119,7 +1119,11 @@ enum tcp_ca_ack_event_flags {
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN 0x2
-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS 0x4
+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \
+ TCP_CONG_NEEDS_ECN | \
+ TCP_CONG_WANTS_CE_EVENTS)
union tcp_cc_info;
@@ -1251,6 +1255,14 @@ static inline char *tcp_ca_get_name_by_k
}
#endif
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+ TCP_CONG_WANTS_CE_EVENTS);
+}
+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct so
tcp_enter_quickack_mode(sk, 2);
break;
case INET_ECN_CE:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct so
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;
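
An illustrative fragment (not from the patch series) of a CC module opting in to CE events via the new flag without requesting full TCP_CONG_NEEDS_ECN behaviour; the names demo_ce and ce_events are assumptions:

/* Illustrative fragment only: receive CE events while leaving general
 * ECN negotiation/behaviour unchanged.
 */
#include <linux/module.h>
#include <net/tcp.h>

struct demo_ce {
    u32 ce_events;   /* CA_EVENT_ECN_IS_CE notifications seen */
};

static void demo_ce_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
    struct demo_ce *ca = inet_csk_ca(sk);

    /* Delivered because of TCP_CONG_WANTS_CE_EVENTS, even though the
     * module did not set TCP_CONG_NEEDS_ECN.
     */
    if (ev == CA_EVENT_ECN_IS_CE)
        ca->ce_events++;
}

static struct tcp_congestion_ops demo_ce_ops __read_mostly = {
    .flags       = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
    .name        = "demo_ce",
    .owner       = THIS_MODULE,
    .ssthresh    = tcp_reno_ssthresh,
    .cong_avoid  = tcp_reno_cong_avoid,
    .undo_cwnd   = tcp_reno_undo_cwnd,
    .cwnd_event  = demo_ce_cwnd_event,
};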


@@ -0,0 +1,118 @@
From 3acb852e1cfcdeea388bd428c6dd81609fd40792 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 27 Sep 2019 17:10:26 -0400
Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API
Reorganize the API for CC modules so that the CC module once again
gets complete control of the TSO sizing decision. This is how the API
was set up around 2016 and the initial BBRv1 upstreaming. Later Eric
Dumazet simplified it. But with wider testing it now seems that to
avoid CPU regressions BBR needs to have a different TSO sizing
function.
This is necessary to handle cases where there are many flows
bottlenecked on the sender host's NIC, in which case BBR's pacing rate
is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because
BBR's pacing rate adapts to the low bandwidth share each flow sees. By
contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very
large cwnd, and thus large pacing rate and large TSO burst size.
Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 4 ++--
net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++-----------
net/ipv4/tcp_output.c | 11 +++++------
3 files changed, 33 insertions(+), 19 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1185,8 +1185,8 @@ struct tcp_congestion_ops {
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* override sysctl_tcp_min_tso_segs */
- u32 (*min_tso_segs)(struct sock *sk);
+ /* pick target number of segments per TSO/GSO skb (optional): */
+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
/* react to a specific lost skb (optional) */
void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs(
return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+ u32 gso_max_size)
+{
+ u32 segs;
+ u64 bytes;
+
+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
+
+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
+ return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
static u32 bbr_tso_segs_goal(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 segs, bytes;
-
- /* Sort of tcp_tso_autosize() but ignoring
- * driver provided sk_gso_max_size.
- */
- bytes = min_t(unsigned long,
- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
- return min(segs, 0x7FU);
+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
}
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -1150,7 +1165,7 @@ static struct tcp_congestion_ops tcp_bbr
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event = bbr_cwnd_event,
.ssthresh = bbr_ssthresh,
- .min_tso_segs = bbr_min_tso_segs,
+ .tso_segs = bbr_tso_segs,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2057,13 +2057,12 @@ static u32 tcp_tso_autosize(const struct
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
- u32 min_tso, tso_segs;
+ u32 tso_segs;
- min_tso = ca_ops->min_tso_segs ?
- ca_ops->min_tso_segs(sk) :
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
-
- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+ tso_segs = ca_ops->tso_segs ?
+ ca_ops->tso_segs(sk, mss_now) :
+ tcp_tso_autosize(sk, mss_now,
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
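
A worked example (illustration only) of the pacing-rate-based TSO budget computed by bbr_tso_segs_generic() above, assuming the kernel's default sk_pacing_shift of 10 and a 1 Gbit/s pacing rate; the kernel additionally caps the budget by gso_max_size and sk_gso_max_segs:

#include <stdio.h>

int main(void)
{
    unsigned long pacing_rate = 125000000UL; /* 1 Gbit/s in bytes/sec */
    unsigned int pacing_shift = 10;          /* default: ~1/1024 s of data */
    unsigned int mss = 1448, min_segs = 2;   /* bbr_min_tso_segs() at high rate */
    unsigned long bytes = pacing_rate >> pacing_shift;   /* ~122070 bytes */
    unsigned long segs = bytes / mss;                    /* ~84 segments */

    if (segs < min_segs)
        segs = min_segs;
    printf("TSO burst budget: %lu bytes -> %lu segments\n", bytes, segs);
    return 0;
}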


@@ -0,0 +1,72 @@
From 3741ada76bab5111cbb9c279cf27e67a0334eb05 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 7 Jan 2024 21:11:26 -0300
Subject: [PATCH 11/19] net-tcp: add fast_ack_mode=1: skip rwin check in
tcp_fast_ack_mode__tcp_ack_snd_check()
Add logic for an experimental TCP connection behavior, enabled with
tp->fast_ack_mode = 1, which disables checking the receive window
before sending an ack in __tcp_ack_snd_check(). If this behavior is
enabled, the data receiver sends an ACK if the amount of data is >
RCV.MSS.
Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/tcp.h | 3 ++-
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_cong.c | 1 +
net/ipv4/tcp_input.c | 5 +++--
4 files changed, 7 insertions(+), 3 deletions(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -369,7 +369,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
- unused:5;
+ fast_ack_mode:2, /* which fast ack mode ? */
+ unused:3;
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
tp->rcv_ooopack = 0;
+ tp->fast_ack_mode = 0;
/* Clean up fastopen related fields */
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_sk(sk)->prior_ssthresh = 0;
+ tcp_sk(sk)->fast_ack_mode = 0;
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
if (tcp_ca_needs_ecn(sk))
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5763,13 +5763,14 @@ static void __tcp_ack_snd_check(struct s
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+ (tp->fast_ack_mode == 1 ||
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise).
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
*/
- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
- __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd))) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */


@@ -0,0 +1,45 @@
From e5d35b7c882b7001f8a31b14c9f08917230dedc3 Mon Sep 17 00:00:00 2001
From: Jianfeng Wang <jfwang@google.com>
Date: Fri, 19 Jun 2020 17:33:45 +0000
Subject: [PATCH 12/19] net-tcp_bbr: v2: record app-limited status of
TLP-repaired flight
When sending a TLP retransmit, record whether the outstanding flight
of data is application limited. This is important for congestion
control modules that want to respond to losses repaired by TLP
retransmits. This is important because the following scenarios convey
very different information:
(1) a packet loss with a small number of packets in flight;
(2) a packet loss with the maximum amount of data in flight allowed
by the CC module.
Effort: net-tcp_bbr
Change-Id: Ic8ae567caa4e4bfd5fd82c3d4be12a5d9171655e
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/tcp.h | 3 ++-
net/ipv4/tcp_output.c | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -370,7 +370,8 @@ struct tcp_sock {
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
fast_ack_mode:2, /* which fast ack mode ? */
- unused:3;
+ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */
+ unused:2;
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3003,6 +3003,7 @@ void tcp_send_loss_probe(struct sock *sk
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
+ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited;
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;


@@ -0,0 +1,45 @@
From 77e7c22b63f8934206b1e89c173558c3967f0779 Mon Sep 17 00:00:00 2001
From: Jianfeng Wang <jfwang@google.com>
Date: Tue, 16 Jun 2020 17:41:19 +0000
Subject: [PATCH 13/19] net-tcp_bbr: v2: inform CC module of losses repaired by
TLP probe
Before this commit, when there is a packet loss that creates a sequence
hole that is filled by a TLP loss probe, then tcp_process_tlp_ack()
only informs the congestion control (CC) module via a back-to-back entry
and exit of CWR. But some congestion control modules (e.g. BBR) do not
respond to CWR events.
This commit adds a new CA event with which the core TCP stack notifies
the CC module when a loss is repaired by a TLP. This will allow CC
modules that do not use the CWR mechanism to have a custom handler for
such TLP recoveries.
Effort: net-tcp_bbr
Change-Id: Ieba72332b401b329bff5a641d2b2043a3fb8f632
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 1 +
2 files changed, 2 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1097,6 +1097,7 @@ enum tcp_ca_event {
CA_EVENT_LOSS, /* loss timeout */
CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
+ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */
};
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3859,6 +3859,7 @@ static void tcp_process_tlp_ack(struct s
/* ACK advances: there was a loss, so reduce cwnd. Reset
* tlp_high_seq in tcp_init_cwnd_reduction()
*/
+ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY);
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);


@@ -0,0 +1,73 @@
From cab22a8e2e87870e8334a12ffcd0ba04ea81126f Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 21 Sep 2020 14:46:26 -0400
Subject: [PATCH 14/19] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq
into rate_sample
Introduce is_acking_tlp_retrans_seq into rate_sample. This bool will
export to the CC module the knowledge of whether the current ACK
matched a TLP retransmit.
Note that when this bool is true, we cannot yet tell (in general) whether
this ACK is for the original or the TLP retransmit.
Effort: net-tcp_bbr
Change-Id: I2e6494332167e75efcbdc99bd5c119034e9c39b4
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 12 +++++++++---
2 files changed, 10 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1161,6 +1161,7 @@ struct rate_sample {
u32 last_end_seq; /* end_seq of most recently ACKed packet */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
+ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
bool is_ece; /* did this ACK have ECN marked? */
};
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3842,7 +3842,8 @@ static void tcp_replace_ts_recent(struct
/* This routine deals with acks during a TLP episode and ends an episode by
* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
*/
-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag,
+ struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3870,6 +3871,11 @@ static void tcp_process_tlp_ack(struct s
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
+ } else {
+ /* This ACK matches a TLP retransmit. We cannot yet tell if
+ * this ACK is for the original or the TLP retransmit.
+ */
+ rs->is_acking_tlp_retrans_seq = 1;
}
}
@@ -4053,7 +4059,7 @@ static int tcp_ack(struct sock *sk, cons
tcp_rack_update_reo_wnd(sk, &rs);
if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
+ tcp_process_tlp_ack(sk, ack, flag, &rs);
if (tcp_ack_is_dubious(sk, flag)) {
if (!(flag & (FLAG_SND_UNA_ADVANCED |
@@ -4097,7 +4103,7 @@ no_queue:
tcp_ack_probe(sk);
if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
+ tcp_process_tlp_ack(sk, ack, flag, &rs);
return 1;
old_ack:


@@ -0,0 +1,112 @@
From 38dd25482f815d949fec91edd7694b2f15823f67 Mon Sep 17 00:00:00 2001
From: David Morley <morleyd@google.com>
Date: Fri, 14 Jul 2023 11:07:56 -0400
Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW
Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW.
This feature indicates that the given destination network is a
low-latency ECN environment, meaning both that ECN CE marks are
applied by the network using a low-latency marking threshold and also
that TCP endpoints provide precise per-data-segment ECN feedback in
ACKs (where the ACK ECE flag echoes the received CE status of all
newly-acknowledged data segments). This feature indication can be used
by congestion control algorithms to decide how to interpret ECN
signals over the given destination network.
This feature is appropriate for datacenter-style ECN marking, such as
the ECN marking approach expected by DCTCP or BBR congestion control
modules.
Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 10 ++++++++++
include/uapi/linux/rtnetlink.h | 4 +++-
net/ipv4/tcp_minisocks.c | 2 ++
net/ipv4/tcp_output.c | 6 ++++--
4 files changed, 19 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
+#define TCP_ECN_LOW 16
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
@@ -777,6 +778,15 @@ static inline void tcp_fast_path_check(s
tcp_fast_path_on(tp);
}
+static inline void tcp_set_ecn_low_from_dst(struct sock *sk,
+ const struct dst_entry *dst)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW))
+ tp->ecn_flags |= TCP_ECN_LOW;
+}
+
u32 tcp_delack_max(const struct sock *sk);
/* Compute the actual rto_min value */
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -507,12 +507,14 @@ enum {
#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
#define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */
#define RTAX_FEATURE_TCP_USEC_TS (1 << 4)
+#define RTAX_FEATURE_ECN_LOW (1 << 5)
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | \
RTAX_FEATURE_ALLFRAG | \
- RTAX_FEATURE_TCP_USEC_TS)
+ RTAX_FEATURE_TCP_USEC_TS | \
+ RTAX_FEATURE_ECN_LOW)
struct rta_session {
__u8 proto;
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *s
u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
bool ca_got_dst = false;
+ tcp_set_ecn_low_from_dst(sk, dst);
+
if (ca_key != TCP_CA_UNSPEC) {
const struct tcp_congestion_ops *ca;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+ const struct dst_entry *dst = __sk_dst_get(sk);
if (!use_ecn) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
use_ecn = true;
}
@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock
tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
+
+ if (dst)
+ tcp_set_ecn_low_from_dst(sk, dst);
}
}
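
An illustrative fragment (not from the patch) of how a CC module's init hook might consult the per-route low-latency ECN hint stored by this patch; the demo_ecn structure and its ecn_low bit are assumptions for the example:

#include <net/tcp.h>

struct demo_ecn {
    u32 ecn_low:1;   /* destination uses low-latency ECN semantics */
};

static void demo_ecn_init(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct demo_ecn *ca = inet_csk_ca(sk);

    /* TCP_ECN_LOW is set from RTAX_FEATURE_ECN_LOW on the route by
     * tcp_set_ecn_low_from_dst() when the connection is created.
     */
    ca->ecn_low = !!(tp->ecn_flags & TCP_ECN_LOW);
}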

[Patch 16/19: file diff suppressed because it is too large to display.]


@@ -0,0 +1,59 @@
From 99e86f904f246ae9ec7a13d1b920eaf4a8c2d47b Mon Sep 17 00:00:00 2001
From: Adithya Abraham Philip <abrahamphilip@google.com>
Date: Fri, 11 Jun 2021 21:56:10 +0000
Subject: [PATCH 17/19] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT
on retransmits
Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to
indicate that retransmitted packets and pure ACKs must have the
ECT bit set. This is necessary for BBR, which when using
ECN expects ECT to be set even on retransmitted packets and ACKs.
Previous to this addition of TCP_ECN_ECT_PERMANENT, CCAs which can use
ECN but don't "need" it did not have a way to indicate that ECT should
be set on retransmissions/ACKs.
Signed-off-by: Adithya Abraham Philip <abrahamphilip@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Change-Id: I8b048eaab35e136fe6501ef6cd89fd9faa15e6d2
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_bbr.c | 3 +++
net/ipv4/tcp_output.c | 3 ++-
3 files changed, 6 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -376,6 +376,7 @@ static inline void tcp_dec_quickack_mode
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
#define TCP_ECN_LOW 16
+#define TCP_ECN_ECT_PERMANENT 32
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -2151,6 +2151,9 @@ __bpf_kfunc static void bbr_init(struct
bbr->plb.pause_until = 0;
tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0;
+
+ if (bbr_can_use_ecn(sk))
+ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
}
/* BBR marks the current round trip as a loss round. */
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -390,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk
th->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else if (!tcp_ca_needs_ecn(sk)) {
+ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+ !tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}


@@ -0,0 +1,38 @@
From 5d7cb61552d374bcaaa95022129b4ca1eace1c33 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 23 Jul 2023 23:25:34 -0400
Subject: [PATCH 18/19] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options
field
Analogous to other important ECN information, export TCPI_OPT_ECN_LOW
in tcp_info tcpi_options field.
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Change-Id: I08d8d8c7e8780e6e37df54038ee50301ac5a0320
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/uapi/linux/tcp.h | 1 +
net/ipv4/tcp.c | 2 ++
2 files changed, 3 insertions(+)
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail {
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
#define TCPI_OPT_USEC_TS 64 /* usec timestamps */
+#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */
/*
* Sender's congestion state indicating normal or abnormal situations
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3850,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struc
info->tcpi_options |= TCPI_OPT_ECN;
if (tp->ecn_flags & TCP_ECN_SEEN)
info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->ecn_flags & TCP_ECN_LOW)
+ info->tcpi_options |= TCPI_OPT_ECN_LOW;
if (tp->syn_data_acked)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
if (tp->tcp_usec_ts)
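
A userspace sketch (not part of the patch) for reading the new bit from TCP_INFO on a connected socket; the fallback #define covers UAPI headers that predate this patch:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCPI_OPT_ECN_LOW
#define TCPI_OPT_ECN_LOW 128   /* value added by this patch */
#endif

/* Print whether the kernel reports low-latency ECN for this connection. */
int print_ecn_low(int fd)
{
    struct tcp_info ti;
    socklen_t len = sizeof(ti);

    memset(&ti, 0, sizeof(ti));
    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) < 0)
        return -1;

    printf("ecn_low: %s\n",
           (ti.tcpi_options & TCPI_OPT_ECN_LOW) ? "yes" : "no");
    return 0;
}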


@@ -0,0 +1,42 @@
From 39838c2f0b09bec02004c092904aada85da2bc2e Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 11 Mar 2024 12:01:13 -0300
Subject: [PATCH 19/19] x86/cfi,bpf: Add tso_segs and skb_marked_lost to
bpf_struct_ops CFI
Rebased-by: Oleksandr Natalenko <oleksandr@natalenko.name>
[ https://github.com/sirlucjan/kernel-patches/blob/master/6.8/bbr3-patches/0001-tcp-bbr3-initial-import.patch ]
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/bpf_tcp_ca.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct
{
}
-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now)
{
return 0;
}
+static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
+{
+}
+
static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
const struct rate_sample *rs)
{
@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_o
.cwnd_event = bpf_tcp_ca_cwnd_event,
.in_ack_event = bpf_tcp_ca_in_ack_event,
.pkts_acked = bpf_tcp_ca_pkts_acked,
- .min_tso_segs = bpf_tcp_ca_min_tso_segs,
+ .tso_segs = bpf_tcp_ca_tso_segs,
+ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost,
.cong_control = bpf_tcp_ca_cong_control,
.undo_cwnd = bpf_tcp_ca_undo_cwnd,
.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,