119 lines
4.3 KiB
Diff
119 lines
4.3 KiB
Diff
|
From 3acb852e1cfcdeea388bd428c6dd81609fd40792 Mon Sep 17 00:00:00 2001
|
||
|
From: Neal Cardwell <ncardwell@google.com>
|
||
|
Date: Fri, 27 Sep 2019 17:10:26 -0400
|
||
|
Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API
|
||
|
|
||
|
Reorganize the API for CC modules so that the CC module once again
|
||
|
gets complete control of the TSO sizing decision. This is how the API
|
||
|
was set up around 2016, when BBRv1 was first upstreamed. Later Eric
|
||
|
Dumazet simplified it. But with wider testing it now seems that to
|
||
|
avoid CPU regressions BBR needs to have a different TSO sizing
|
||
|
function.
|
||
|
|
||
|
This is necessary to handle cases where there are many flows
|
||
|
bottlenecked on the sender host's NIC, in which case BBR's pacing rate
|
||
|
is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because
|
||
|
BBR's pacing rate adapts to the low bandwidth share each flow sees. By
|
||
|
contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very
|
||
|
large cwnd, and thus large pacing rate and large TSO burst size.
|
||
|
|
||
|
Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea
|
||
|
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
|
||
|
---
|
||
|
include/net/tcp.h | 4 ++--
|
||
|
net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++-----------
|
||
|
net/ipv4/tcp_output.c | 11 +++++------
|
||
|
3 files changed, 33 insertions(+), 19 deletions(-)
|
||
|
|
||
|
--- a/include/net/tcp.h
|
||
|
+++ b/include/net/tcp.h
|
||
|
@@ -1185,8 +1185,8 @@ struct tcp_congestion_ops {
|
||
|
/* hook for packet ack accounting (optional) */
|
||
|
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
|
||
|
|
||
|
- /* override sysctl_tcp_min_tso_segs */
|
||
|
- u32 (*min_tso_segs)(struct sock *sk);
|
||
|
+ /* pick target number of segments per TSO/GSO skb (optional): */
|
||
|
+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
|
||
|
|
||
|
/* react to a specific lost skb (optional) */
|
||
|
void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
|
||
|
--- a/net/ipv4/tcp_bbr.c
|
||
|
+++ b/net/ipv4/tcp_bbr.c
|
||
|
@@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs(
|
||
|
return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
|
||
|
}
|
||
|
|
||
|
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
|
||
|
+ * a particular max gso size as a constraint.
|
||
|
+ */
|
||
|
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
|
||
|
+ u32 gso_max_size)
|
||
|
+{
|
||
|
+ u32 segs;
|
||
|
+ u64 bytes;
|
||
|
+
|
||
|
+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
|
||
|
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
|
||
|
+
|
||
|
+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
|
||
|
+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
|
||
|
+ return segs;
|
||
|
+}
|
||
|
+
|
||
|
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
|
||
|
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
|
||
|
+{
|
||
|
+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
|
||
|
+}
|
||
|
+
|
||
|
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
|
||
|
static u32 bbr_tso_segs_goal(struct sock *sk)
|
||
|
{
|
||
|
struct tcp_sock *tp = tcp_sk(sk);
|
||
|
- u32 segs, bytes;
|
||
|
-
|
||
|
- /* Sort of tcp_tso_autosize() but ignoring
|
||
|
- * driver provided sk_gso_max_size.
|
||
|
- */
|
||
|
- bytes = min_t(unsigned long,
|
||
|
- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
|
||
|
- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
|
||
|
- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
|
||
|
|
||
|
- return min(segs, 0x7FU);
|
||
|
+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
|
||
|
}
|
||
|
|
||
|
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
|
||
|
@@ -1150,7 +1165,7 @@ static struct tcp_congestion_ops tcp_bbr
|
||
|
.undo_cwnd = bbr_undo_cwnd,
|
||
|
.cwnd_event = bbr_cwnd_event,
|
||
|
.ssthresh = bbr_ssthresh,
|
||
|
- .min_tso_segs = bbr_min_tso_segs,
|
||
|
+ .tso_segs = bbr_tso_segs,
|
||
|
.get_info = bbr_get_info,
|
||
|
.set_state = bbr_set_state,
|
||
|
};
|
||
|
--- a/net/ipv4/tcp_output.c
|
||
|
+++ b/net/ipv4/tcp_output.c
|
||
|
@@ -2057,13 +2057,12 @@ static u32 tcp_tso_autosize(const struct
|
||
|
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
|
||
|
{
|
||
|
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
|
||
|
- u32 min_tso, tso_segs;
|
||
|
+ u32 tso_segs;
|
||
|
|
||
|
- min_tso = ca_ops->min_tso_segs ?
|
||
|
- ca_ops->min_tso_segs(sk) :
|
||
|
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
|
||
|
-
|
||
|
- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
|
||
|
+ tso_segs = ca_ops->tso_segs ?
|
||
|
+ ca_ops->tso_segs(sk, mss_now) :
|
||
|
+ tcp_tso_autosize(sk, mss_now,
|
||
|
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
|
||
|
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
|
||
|
}
|
||
|
|