Konstantin Demin
8cbaf1dea2
3rd patches (in alphabetical order): bbr3, ntsync5, openwrt, pf-kernel, xanmod, zen. No configuration changes for now.
From 772c6e460211ac740b2550fa75be36b8a49731fe Mon Sep 17 00:00:00 2001
From: "mfreemon@cloudflare.com" <mfreemon@cloudflare.com>
Date: Tue, 1 Mar 2022 17:06:02 -0600
Subject: [PATCH] tcp: Add a sysctl to skip tcp collapse processing when the
 receive buffer is full

For context and additional information about this patch, see the
blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/

sysctl: net.ipv4.tcp_collapse_max_bytes

If tcp_collapse_max_bytes is non-zero, attempt to collapse the
queue to free up memory if the current amount of memory allocated
is less than tcp_collapse_max_bytes. Otherwise, the packet is
dropped without attempting to collapse the queue.

If tcp_collapse_max_bytes is zero, this feature is disabled
and the default Linux behavior is used. The default Linux
behavior is to always perform the attempt to collapse the
queue to free up memory.
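
A minimal stand-alone sketch of that decision (illustration only, not
part of the patch; the helper name and the rmem_alloc parameter are
hypothetical stand-ins for the socket's receive-queue accounting):

    /* Hypothetical helper mirroring the check added to tcp_prune_queue() below. */
    #include <stdbool.h>

    static bool tcp_should_skip_collapse(unsigned int collapse_max_bytes,
                                         unsigned int rmem_alloc)
    {
        if (!collapse_max_bytes)    /* 0: feature disabled, always try to collapse */
            return false;
        return rmem_alloc > collapse_max_bytes; /* over the cap: drop instead */
    }
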
When the receive queue is small, we want to collapse the
queue. There are two reasons for this: (a) the latency of
performing the collapse will be small on a small queue, and
(b) we want to avoid sending a congestion signal (via a
packet drop) to the sender when the receive queue is small.

The result is that we avoid latency spikes caused by the
time it takes to perform the collapse logic when the receive
queue is large and full, while preserving existing behavior
and performance for all other cases.
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
 include/net/netns/ipv4.h   |  1 +
 include/trace/events/tcp.h |  7 +++++++
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 net/ipv4/tcp_input.c       | 36 ++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c        |  1 +
 5 files changed, 52 insertions(+)
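
Usage note (not part of the patch): the new sysctl is exposed to userspace
as /proc/sys/net/ipv4/tcp_collapse_max_bytes. A minimal, purely illustrative
way to set it from C is sketched below; the 6 MiB limit is an arbitrary
example value, not a recommendation.

    /* Hypothetical userspace example: write a byte limit to the new sysctl. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_collapse_max_bytes", "w");

        if (!f) {
            perror("tcp_collapse_max_bytes");
            return 1;
        }
        fprintf(f, "%u\n", 6U * 1024 * 1024);  /* 6 MiB, example only */
        return fclose(f) ? 1 : 0;
    }
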
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -223,6 +223,7 @@ struct netns_ipv4 {
 
 	u8 sysctl_fib_notify_on_flag_change;
 	u8 sysctl_tcp_syn_linear_timeouts;
+	unsigned int sysctl_tcp_collapse_max_bytes;
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	u8 sysctl_udp_l3mdev_accept;
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -213,6 +213,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space
 	TP_ARGS(sk)
 );
 
+DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded,
+
+	TP_PROTO(struct sock *sk),
+
+	TP_ARGS(sk)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1558,6 +1558,13 @@ static struct ctl_table ipv4_net_table[]
 		.extra2 = SYSCTL_ONE,
 	},
 	{
+		.procname = "tcp_collapse_max_bytes",
+		.data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_douintvec_minmax,
+	},
+	{
 		.procname = "tcp_pingpong_thresh",
 		.data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
 		.maxlen = sizeof(u8),
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5645,6 +5645,7 @@ static bool tcp_prune_ofo_queue(struct s
 static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
 
@@ -5656,6 +5657,39 @@ static int tcp_prune_queue(struct sock *
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
 		return 0;
 
+	/* For context and additional information about this patch, see the
+	 * blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/
+	 *
+	 * sysctl: net.ipv4.tcp_collapse_max_bytes
+	 *
+	 * If tcp_collapse_max_bytes is non-zero, attempt to collapse the
+	 * queue to free up memory if the current amount of memory allocated
+	 * is less than tcp_collapse_max_bytes. Otherwise, the packet is
+	 * dropped without attempting to collapse the queue.
+	 *
+	 * If tcp_collapse_max_bytes is zero, this feature is disabled
+	 * and the default Linux behavior is used. The default Linux
+	 * behavior is to always perform the attempt to collapse the
+	 * queue to free up memory.
+	 *
+	 * When the receive queue is small, we want to collapse the
+	 * queue. There are two reasons for this: (a) the latency of
+	 * performing the collapse will be small on a small queue, and
+	 * (b) we want to avoid sending a congestion signal (via a
+	 * packet drop) to the sender when the receive queue is small.
+	 *
+	 * The result is that we avoid latency spikes caused by the
+	 * time it takes to perform the collapse logic when the receive
+	 * queue is large and full, while preserving existing behavior
+	 * and performance for all other cases.
+	 */
+	if (net->ipv4.sysctl_tcp_collapse_max_bytes &&
+	    (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) {
+		/* We are dropping the packet */
+		trace_tcp_collapse_max_bytes_exceeded(sk);
+		goto do_not_collapse;
+	}
+
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5674,6 +5708,8 @@ static int tcp_prune_queue(struct sock *
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
 		return 0;
 
+do_not_collapse:
+
 	/* If we are really being abused, tell the caller to silently
 	 * drop receive data on the floor. It will get retransmitted
 	 * and hopefully then we'll have sufficient space.
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3508,6 +3508,7 @@ static int __net_init tcp_sk_init(struct
 
 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
 	net->ipv4.sysctl_tcp_shrink_window = 0;
+	net->ipv4.sysctl_tcp_collapse_max_bytes = 0;
 
 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);