
add 3rd party/custom patches

3rd party patches (in alphabetical order):
- bbr3
- ntsync5
- openwrt
- pf-kernel
- xanmod
- zen

no configuration changes for now
2024-10-29 05:12:06 +03:00
parent 8082dfeaca
commit 8cbaf1dea2
186 changed files with 43626 additions and 0 deletions


@@ -0,0 +1,209 @@
From 0bbb6450b1ba362c1c2e7d8d752b39ec9844629b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Wed, 16 Jan 2019 23:13:25 +0100
Subject: [PATCH 1/4] binder: turn into module
The Android binder driver needs to become a module for the sake of shipping
Anbox. To do this we need to export the following functions since binder is
currently still using them:
- security_binder_set_context_mgr()
- security_binder_transaction()
- security_binder_transfer_binder()
- security_binder_transfer_file()
- can_nice()
- __close_fd_get_file()
- mmput_async()
- task_work_add()
- map_kernel_range_noflush()
- get_vm_area()
- zap_page_range_single()
- put_ipc_ns()
- get_ipc_ns_exported()
- show_init_ipc_ns()
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
[ saf: fix additional reference to init_ipc_ns from 5.0-rc6 ]
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
[ arighi: fix EXPORT_SYMBOL vs EXPORT_SYMBOL_GPL change from 6.0-rc5 ]
[ arighi: zap_page_range() has been dropped, export zap_page_range_single() in 6.3 ]
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/android/Kconfig | 6 +++---
drivers/android/binder.c | 17 ++++++++++++++---
drivers/android/binder_alloc.h | 3 ++-
drivers/android/binder_internal.h | 5 +++--
drivers/android/binderfs.c | 6 +++---
fs/file.c | 1 +
include/linux/ipc_namespace.h | 3 +++
ipc/namespace.c | 17 +++++++++++++++++
kernel/sched/syscalls.c | 1 +
kernel/task_work.c | 1 +
mm/memory.c | 1 +
mm/vmalloc.c | 1 +
security/security.c | 4 ++++
14 files changed, 61 insertions(+), 15 deletions(-)
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -14,8 +14,8 @@ config ANDROID_BINDER_IPC
between said processes.
config ANDROID_BINDERFS
- bool "Android Binderfs filesystem"
- depends on ANDROID_BINDER_IPC
+ tristate "Android Binderfs filesystem"
+ depends on (ANDROID_BINDER_IPC=y) || (ANDROID_BINDER_IPC=m && m)
default n
help
Binderfs is a pseudo-filesystem for the Android Binder IPC driver
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -6713,9 +6713,20 @@ err_alloc_device_names_failed:
return ret;
}
-device_initcall(binder_init);
+module_init(binder_init);
+/*
+ * binder will have no exit function since binderfs instances can be mounted
+ * multiple times and also in user namespaces finding and destroying them all
+ * is not feasible without introducing insane locking. Just ignoring existing
+ * instances on module unload also wouldn't work since we would lose track of
+ * what major number was dynamically allocated and also what minor numbers are
+ * already given out. So this would get us into all kinds of issues with device
+ * number reuse. So simply don't allow unloading unless we are forced to do so.
+ */
+
+MODULE_AUTHOR("Google, Inc.");
+MODULE_DESCRIPTION("Driver for Android binder device");
+MODULE_LICENSE("GPL v2");
#define CREATE_TRACE_POINTS
#include "binder_trace.h"
-
-MODULE_LICENSE("GPL v2");
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -6,6 +6,7 @@
#ifndef _LINUX_BINDER_ALLOC_H
#define _LINUX_BINDER_ALLOC_H
+#include <linux/kconfig.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/mm.h>
@@ -111,7 +112,7 @@ struct binder_alloc {
bool oneway_spam_detected;
};
-#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
+#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_SELFTEST)
void binder_selftest_alloc(struct binder_alloc *alloc);
#else
static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/fs.h>
+#include <linux/kconfig.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
@@ -78,7 +79,7 @@ extern const struct file_operations bind
extern char *binder_devices_param;
-#ifdef CONFIG_ANDROID_BINDERFS
+#if IS_ENABLED(CONFIG_ANDROID_BINDERFS)
extern bool is_binderfs_device(const struct inode *inode);
extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name,
const struct file_operations *fops,
@@ -99,7 +100,7 @@ static inline struct dentry *binderfs_cr
static inline void binderfs_remove_file(struct dentry *dentry) {}
#endif
-#ifdef CONFIG_ANDROID_BINDERFS
+#if IS_ENABLED(CONFIG_ANDROID_BINDERFS)
extern int __init init_binderfs(void);
#else
static inline int __init init_binderfs(void)
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -120,7 +120,7 @@ static int binderfs_binder_device_create
struct super_block *sb = ref_inode->i_sb;
struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
- bool use_reserve = (info->ipc_ns == &init_ipc_ns);
+ bool use_reserve = (info->ipc_ns == show_init_ipc_ns());
#else
bool use_reserve = true;
#endif
@@ -397,7 +397,7 @@ static int binderfs_binder_ctl_create(st
struct dentry *root = sb->s_root;
struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
- bool use_reserve = (info->ipc_ns == &init_ipc_ns);
+ bool use_reserve = (info->ipc_ns == show_init_ipc_ns());
#else
bool use_reserve = true;
#endif
@@ -683,7 +683,7 @@ static int binderfs_fill_super(struct su
return -ENOMEM;
info = sb->s_fs_info;
- info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
+ info->ipc_ns = get_ipc_ns_exported(current->nsproxy->ipc_ns);
info->root_gid = make_kgid(sb->s_user_ns, 0);
if (!gid_valid(info->root_gid))
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -128,6 +128,9 @@ extern int mq_init_ns(struct ipc_namespa
static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
#endif
+extern struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns);
+extern struct ipc_namespace *show_init_ipc_ns(void);
+
#if defined(CONFIG_IPC_NS)
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
struct user_namespace *user_ns, struct ipc_namespace *ns);
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -207,6 +207,22 @@ void put_ipc_ns(struct ipc_namespace *ns
}
EXPORT_SYMBOL_GPL(put_ipc_ns);
+struct ipc_namespace *get_ipc_ns_exported(struct ipc_namespace *ns)
+{
+ return get_ipc_ns(ns);
+}
+EXPORT_SYMBOL(get_ipc_ns_exported);
+
+struct ipc_namespace *show_init_ipc_ns(void)
+{
+#if defined(CONFIG_IPC_NS)
+ return &init_ipc_ns;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL(show_init_ipc_ns);
+
static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
{
return container_of(ns, struct ipc_namespace, ns);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3166,6 +3166,7 @@ struct vm_struct *get_vm_area(unsigned l
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
+EXPORT_SYMBOL(get_vm_area);
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
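
For reference, once binder and binderfs are built as modules, a container runtime
such as Anbox mounts its own binderfs instance and allocates private binder devices
through the binder-control ioctl. A minimal user-space sketch, assuming binderfs is
mounted at /dev/binderfs and using an illustrative device name:

    /* alloc-binder.c: allocate a binder device via binderfs (illustrative) */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/android/binderfs.h>

    int main(void)
    {
        struct binderfs_device dev = { 0 };
        /* assumes: mount -t binder binder /dev/binderfs */
        int fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC);

        if (fd < 0)
            return 1;
        strcpy(dev.name, "anbox-binder");            /* name is illustrative */
        if (ioctl(fd, BINDER_CTL_ADD, &dev) < 0) {   /* creates /dev/binderfs/anbox-binder */
            close(fd);
            return 1;
        }
        close(fd);
        printf("allocated %s (%u:%u)\n", dev.name, dev.major, dev.minor);
        return 0;
    }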


@@ -0,0 +1,82 @@
From 1e2892b83e6c4062623f9aec06eba27884d47059 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 13 Dec 2018 01:00:49 +0000
Subject: [PATCH 1/4] sched/wait: Do accept() in LIFO order for cache
efficiency
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/wait.h | 2 ++
kernel/sched/wait.c | 24 ++++++++++++++++++++++++
net/ipv4/inet_connection_sock.c | 2 +-
3 files changed, 27 insertions(+), 1 deletion(-)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -163,6 +163,7 @@ static inline bool wq_has_sleeper(struct
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
@@ -1191,6 +1192,7 @@ do { \
*/
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -259,6 +270,19 @@ prepare_to_wait_exclusive(struct wait_qu
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue(wq_head, wq_entry);
+ set_current_state(state);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo);
+
void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
{
wq_entry->flags = flags;
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(str
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
- prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
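
The functional difference from prepare_to_wait_exclusive() is only the insertion
point: __add_wait_queue() places the acceptor at the head of the wait queue instead
of the tail, so the task that went to sleep most recently (and is most likely still
cache-hot) is woken first. The workload this targets is several pre-forked workers
blocking in accept() on one listening socket; a hedged user-space sketch of that
pattern, with arbitrary port and worker count:

    /* pre-forked accept() workers sharing one listening socket (illustrative) */
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in addr = { 0 };
        int lfd = socket(AF_INET, SOCK_STREAM, 0);

        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);               /* port is arbitrary */
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
        listen(lfd, 128);

        for (int i = 0; i < 3; i++)                /* worker count is arbitrary */
            if (fork() == 0)
                break;

        for (;;) {
            int cfd = accept(lfd, NULL, NULL);     /* all workers sleep here */
            if (cfd < 0)
                continue;
            /* ... handle the connection ... */
            close(cfd);
        }
    }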


@@ -0,0 +1,25 @@
From 52caaf4af0242cc3ac1ddaf9791c4c08a4b7226d Mon Sep 17 00:00:00 2001
From: William Douglas <william.douglas@intel.com>
Date: Wed, 20 Jun 2018 17:23:21 +0000
Subject: [PATCH 2/4] firmware: Enable stateless firmware loading
Prefer the order of specific version before generic and /etc before
/lib to enable the user to give specific overrides for generic
firmware and distribution firmware.
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/base/firmware_loader/main.c | 2 ++
1 file changed, 2 insertions(+)
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -471,6 +471,8 @@ static int fw_decompress_xz(struct devic
static char fw_path_para[256];
static const char * const fw_path[] = {
fw_path_para,
+ "/etc/firmware/" UTS_RELEASE,
+ "/etc/firmware",
"/lib/firmware/updates/" UTS_RELEASE,
"/lib/firmware/updates",
"/lib/firmware/" UTS_RELEASE,


@@ -0,0 +1,32 @@
From f57e26428b29ddcdf58b35659002293bde1bf281 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Feb 2018 23:35:41 +0000
Subject: [PATCH 3/4] locking: rwsem: spin faster
tweak rwsem owner spinning a bit
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
kernel/locking/rwsem.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -749,6 +749,7 @@ rwsem_spin_on_owner(struct rw_semaphore
struct task_struct *new, *owner;
unsigned long flags, new_flags;
enum owner_state state;
+ int i = 0;
lockdep_assert_preemption_disabled();
@@ -785,7 +786,8 @@ rwsem_spin_on_owner(struct rw_semaphore
break;
}
- cpu_relax();
+ if (i++ > 1000)
+ cpu_relax();
}
return state;


@@ -0,0 +1,809 @@
From 9a066610b055315bf155a99b2ea0a58245ef11e2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 20 Feb 2018 15:56:02 +0100
Subject: [PATCH 2/2] netfilter: add xt_FLOWOFFLOAD target
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/netfilter/nf_flow_table.h | 5 +
include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h | 17 +
net/netfilter/Kconfig | 9 +
net/netfilter/Makefile | 1 +
net/netfilter/nf_flow_table_core.c | 5 +-
net/netfilter/xt_FLOWOFFLOAD.c | 698 ++++++++++++++++++
6 files changed, 732 insertions(+), 3 deletions(-)
create mode 100644 include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h
create mode 100644 net/netfilter/xt_FLOWOFFLOAD.c
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -294,6 +294,11 @@ void nf_flow_table_free(struct nf_flowta
void flow_offload_teardown(struct flow_offload *flow);
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
+ void *data);
+
void nf_flow_snat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir);
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _XT_FLOWOFFLOAD_H
+#define _XT_FLOWOFFLOAD_H
+
+#include <linux/types.h>
+
+enum {
+ XT_FLOWOFFLOAD_HW = 1 << 0,
+
+ XT_FLOWOFFLOAD_MASK = XT_FLOWOFFLOAD_HW
+};
+
+struct xt_flowoffload_target_info {
+ __u32 flags;
+};
+
+#endif /* _XT_FLOWOFFLOAD_H */
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1041,6 +1041,15 @@ config NETFILTER_XT_TARGET_NOTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_TARGET_CT
+config NETFILTER_XT_TARGET_FLOWOFFLOAD
+ tristate '"FLOWOFFLOAD" target support'
+ depends on NF_FLOW_TABLE
+ depends on NETFILTER_INGRESS
+ help
+ This option adds a `FLOWOFFLOAD' target, which uses the nf_flow_offload
+ module to speed up processing of packets by bypassing the usual
+ netfilter chains
+
config NETFILTER_XT_TARGET_RATEEST
tristate '"RATEEST" target support'
depends on NETFILTER_ADVANCED
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -168,6 +168,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_FLOWOFFLOAD) += xt_FLOWOFFLOAD.o
obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -7,7 +7,6 @@
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
-#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -373,8 +372,7 @@ flow_offload_lookup(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
-static int
-nf_flow_table_iterate(struct nf_flowtable *flow_table,
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct nf_flowtable *flowtable,
struct flow_offload *flow, void *data),
void *data)
@@ -435,6 +433,7 @@ static void nf_flow_offload_gc_step(stru
nf_flow_offload_stats(flow_table, flow);
}
}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
{
--- /dev/null
+++ b/net/netfilter/xt_FLOWOFFLOAD.c
@@ -0,0 +1,698 @@
+/*
+ * Copyright (C) 2018-2021 Felix Fietkau <nbd@nbd.name>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_FLOWOFFLOAD.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct xt_flowoffload_hook {
+ struct hlist_node list;
+ struct nf_hook_ops ops;
+ struct net *net;
+ bool registered;
+ bool used;
+};
+
+struct xt_flowoffload_table {
+ struct nf_flowtable ft;
+ struct hlist_head hooks;
+ struct delayed_work work;
+};
+
+struct nf_forward_info {
+ const struct net_device *indev;
+ const struct net_device *outdev;
+ const struct net_device *hw_outdev;
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
+ u8 ingress_vlans;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
+};
+
+static DEFINE_SPINLOCK(hooks_lock);
+
+struct xt_flowoffload_table flowtable[2];
+
+static unsigned int
+xt_flowoffload_net_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (!nf_flow_pppoe_proto(skb, &proto))
+ return NF_ACCEPT;
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
+ case htons(ETH_P_IP):
+ return nf_flow_offload_ip_hook(priv, skb, state);
+ case htons(ETH_P_IPV6):
+ return nf_flow_offload_ipv6_hook(priv, skb, state);
+ }
+
+ return NF_ACCEPT;
+}
+
+static int
+xt_flowoffload_create_hook(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+ struct nf_hook_ops *ops;
+
+ hook = kzalloc(sizeof(*hook), GFP_ATOMIC);
+ if (!hook)
+ return -ENOMEM;
+
+ ops = &hook->ops;
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = NF_NETDEV_INGRESS;
+ ops->priority = 10;
+ ops->priv = &table->ft;
+ ops->hook = xt_flowoffload_net_hook;
+ ops->dev = dev;
+
+ hlist_add_head(&hook->list, &table->hooks);
+ mod_delayed_work(system_power_efficient_wq, &table->work, 0);
+
+ return 0;
+}
+
+static struct xt_flowoffload_hook *
+flow_offload_lookup_hook(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->ops.dev == dev)
+ return hook;
+ }
+
+ return NULL;
+}
+
+static void
+xt_flowoffload_check_device(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+
+ if (!dev)
+ return;
+
+ spin_lock_bh(&hooks_lock);
+ hook = flow_offload_lookup_hook(table, dev);
+ if (hook)
+ hook->used = true;
+ else
+ xt_flowoffload_create_hook(table, dev);
+ spin_unlock_bh(&hooks_lock);
+}
+
+static void
+xt_flowoffload_register_hooks(struct xt_flowoffload_table *table)
+{
+ struct xt_flowoffload_hook *hook;
+
+restart:
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->registered)
+ continue;
+
+ hook->registered = true;
+ hook->net = dev_net(hook->ops.dev);
+ spin_unlock_bh(&hooks_lock);
+ nf_register_net_hook(hook->net, &hook->ops);
+ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD)
+ table->ft.type->setup(&table->ft, hook->ops.dev,
+ FLOW_BLOCK_BIND);
+ spin_lock_bh(&hooks_lock);
+ goto restart;
+ }
+
+}
+
+static bool
+xt_flowoffload_cleanup_hooks(struct xt_flowoffload_table *table)
+{
+ struct xt_flowoffload_hook *hook;
+ bool active = false;
+
+restart:
+ spin_lock_bh(&hooks_lock);
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->used || !hook->registered) {
+ active = true;
+ continue;
+ }
+
+ hlist_del(&hook->list);
+ spin_unlock_bh(&hooks_lock);
+ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD)
+ table->ft.type->setup(&table->ft, hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+ nf_unregister_net_hook(hook->net, &hook->ops);
+ kfree(hook);
+ goto restart;
+ }
+ spin_unlock_bh(&hooks_lock);
+
+ return active;
+}
+
+static void
+xt_flowoffload_check_hook(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data)
+{
+ struct xt_flowoffload_table *table;
+ struct flow_offload_tuple *tuple0 = &flow->tuplehash[0].tuple;
+ struct flow_offload_tuple *tuple1 = &flow->tuplehash[1].tuple;
+ struct xt_flowoffload_hook *hook;
+
+ table = container_of(flowtable, struct xt_flowoffload_table, ft);
+
+ spin_lock_bh(&hooks_lock);
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->ops.dev->ifindex != tuple0->iifidx &&
+ hook->ops.dev->ifindex != tuple1->iifidx)
+ continue;
+
+ hook->used = true;
+ }
+ spin_unlock_bh(&hooks_lock);
+}
+
+static void
+xt_flowoffload_hook_work(struct work_struct *work)
+{
+ struct xt_flowoffload_table *table;
+ struct xt_flowoffload_hook *hook;
+ int err;
+
+ table = container_of(work, struct xt_flowoffload_table, work.work);
+
+ spin_lock_bh(&hooks_lock);
+ xt_flowoffload_register_hooks(table);
+ hlist_for_each_entry(hook, &table->hooks, list)
+ hook->used = false;
+ spin_unlock_bh(&hooks_lock);
+
+ err = nf_flow_table_iterate(&table->ft, xt_flowoffload_check_hook,
+ NULL);
+ if (err && err != -EAGAIN)
+ goto out;
+
+ if (!xt_flowoffload_cleanup_hooks(table))
+ return;
+
+out:
+ queue_delayed_work(system_power_efficient_wq, &table->work, HZ);
+}
+
+static bool
+xt_flowoffload_skip(struct sk_buff *skb, int family)
+{
+ if (skb_sec_path(skb))
+ return true;
+
+ if (family == NFPROTO_IPV4) {
+ const struct ip_options *opt = &(IPCB(skb)->opt);
+
+ if (unlikely(opt->optlen))
+ return true;
+ }
+
+ return false;
+}
+
+static enum flow_offload_xmit_type nf_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nf_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir,
+ struct net_device **dev)
+{
+ dev[!dir] = dst_cache->dev;
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nf_xmit_type(dst_cache);
+}
+
+static bool nf_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
+static void nf_dev_path_info(const struct net_device_path_stack *stack,
+ struct nf_forward_info *info,
+ unsigned char *ha)
+{
+ const struct net_device_path *path;
+ int i;
+
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
+ case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
+ info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths;
+ break;
+ }
+
+ /* DEV_PATH_VLAN and DEV_PATH_PPPOE */
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ if (!info->outdev)
+ info->outdev = path->dev;
+ info->encap[info->num_encaps].id = path->encap.id;
+ info->encap[info->num_encaps].proto = path->encap.proto;
+ info->num_encaps++;
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ break;
+ case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1);
+ break;
+ case DEV_PATH_BR_VLAN_TAG:
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ info->num_encaps--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+ break;
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+ if (!info->outdev)
+ info->outdev = info->indev;
+
+ info->hw_outdev = info->indev;
+
+ if (nf_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+static int nf_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nf_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+static void nf_dev_forward_path(struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct net_device **devs)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nf_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ if (nf_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nf_dev_path_info(&stack, &info, ha);
+
+ devs[!dir] = (struct net_device *)info.indev;
+ if (!info.indev)
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+
+static int
+xt_flowoffload_route(struct sk_buff *skb, const struct nf_conn *ct,
+ const struct xt_action_param *par,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct net_device **devs)
+{
+ struct dst_entry *this_dst = skb_dst(skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (xt_family(par)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.flowi4_oif = xt_in(par)->ifindex;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.flowi6_oif = xt_in(par)->ifindex;
+ break;
+ }
+
+ nf_route(xt_net(par), &other_dst, &fl, false, xt_family(par));
+ if (!other_dst)
+ return -ENOENT;
+
+ nf_default_forward_path(route, this_dst, dir, devs);
+ nf_default_forward_path(route, other_dst, !dir, devs);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+ nf_dev_forward_path(route, ct, dir, devs);
+ nf_dev_forward_path(route, ct, !dir, devs);
+ }
+
+ return 0;
+}
+
+static unsigned int
+flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct xt_flowoffload_table *table;
+ const struct xt_flowoffload_target_info *info = par->targinfo;
+ struct tcphdr _tcph, *tcph = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct nf_flow_route route = {};
+ struct flow_offload *flow = NULL;
+ struct net_device *devs[2] = {};
+ struct nf_conn *ct;
+ struct net *net;
+
+ if (xt_flowoffload_skip(skb, xt_family(par)))
+ return XT_CONTINUE;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return XT_CONTINUE;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+ case IPPROTO_TCP:
+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return XT_CONTINUE;
+
+ tcph = skb_header_pointer(skb, par->thoff,
+ sizeof(_tcph), &_tcph);
+ if (unlikely(!tcph || tcph->fin || tcph->rst))
+ return XT_CONTINUE;
+ break;
+ case IPPROTO_UDP:
+ break;
+ default:
+ return XT_CONTINUE;
+ }
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
+ return XT_CONTINUE;
+
+ if (!nf_ct_is_confirmed(ct))
+ return XT_CONTINUE;
+
+ devs[dir] = xt_out(par);
+ devs[!dir] = xt_in(par);
+
+ if (!devs[dir] || !devs[!dir])
+ return XT_CONTINUE;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return XT_CONTINUE;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ if (xt_flowoffload_route(skb, ct, par, &route, dir, devs) < 0)
+ goto err_flow_route;
+
+ flow = flow_offload_alloc(ct);
+ if (!flow)
+ goto err_flow_alloc;
+
+ flow_offload_route_init(flow, &route);
+
+ if (tcph) {
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ table = &flowtable[!!(info->flags & XT_FLOWOFFLOAD_HW)];
+
+ net = read_pnet(&table->ft.net);
+ if (!net)
+ write_pnet(&table->ft.net, xt_net(par));
+
+ if (flow_offload_add(&table->ft, flow) < 0)
+ goto err_flow_add;
+
+ xt_flowoffload_check_device(table, devs[0]);
+ xt_flowoffload_check_device(table, devs[1]);
+
+ dst_release(route.tuple[!dir].dst);
+
+ return XT_CONTINUE;
+
+err_flow_add:
+ flow_offload_free(flow);
+err_flow_alloc:
+ dst_release(route.tuple[!dir].dst);
+err_flow_route:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+
+ return XT_CONTINUE;
+}
+
+static int flowoffload_chk(const struct xt_tgchk_param *par)
+{
+ struct xt_flowoffload_target_info *info = par->targinfo;
+
+ if (info->flags & ~XT_FLOWOFFLOAD_MASK)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_target offload_tg_reg __read_mostly = {
+ .family = NFPROTO_UNSPEC,
+ .name = "FLOWOFFLOAD",
+ .revision = 0,
+ .targetsize = sizeof(struct xt_flowoffload_target_info),
+ .usersize = sizeof(struct xt_flowoffload_target_info),
+ .checkentry = flowoffload_chk,
+ .target = flowoffload_tg,
+ .me = THIS_MODULE,
+};
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct xt_flowoffload_hook *hook0, *hook1;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ spin_lock_bh(&hooks_lock);
+ hook0 = flow_offload_lookup_hook(&flowtable[0], dev);
+ if (hook0)
+ hlist_del(&hook0->list);
+
+ hook1 = flow_offload_lookup_hook(&flowtable[1], dev);
+ if (hook1)
+ hlist_del(&hook1->list);
+ spin_unlock_bh(&hooks_lock);
+
+ if (hook0) {
+ nf_unregister_net_hook(hook0->net, &hook0->ops);
+ kfree(hook0);
+ }
+
+ if (hook1) {
+ nf_unregister_net_hook(hook1->net, &hook1->ops);
+ kfree(hook1);
+ }
+
+ nf_flow_table_cleanup(dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = flow_offload_netdev_event,
+};
+
+static int nf_flow_rule_route_inet(struct net *net,
+ struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ int err;
+
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+ break;
+ case NFPROTO_IPV6:
+ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+ break;
+ default:
+ err = -1;
+ break;
+ }
+
+ return err;
+}
+
+static struct nf_flowtable_type flowtable_inet = {
+ .family = NFPROTO_INET,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_inet,
+ .free = nf_flow_table_free,
+ .hook = xt_flowoffload_net_hook,
+ .owner = THIS_MODULE,
+};
+
+static int init_flowtable(struct xt_flowoffload_table *tbl)
+{
+ INIT_DELAYED_WORK(&tbl->work, xt_flowoffload_hook_work);
+ tbl->ft.type = &flowtable_inet;
+ tbl->ft.flags = NF_FLOWTABLE_COUNTER;
+
+ return nf_flow_table_init(&tbl->ft);
+}
+
+static int __init xt_flowoffload_tg_init(void)
+{
+ int ret;
+
+ register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+ ret = init_flowtable(&flowtable[0]);
+ if (ret)
+ return ret;
+
+ ret = init_flowtable(&flowtable[1]);
+ if (ret)
+ goto cleanup;
+
+ flowtable[1].ft.flags |= NF_FLOWTABLE_HW_OFFLOAD;
+
+ ret = xt_register_target(&offload_tg_reg);
+ if (ret)
+ goto cleanup2;
+
+ return 0;
+
+cleanup2:
+ nf_flow_table_free(&flowtable[1].ft);
+cleanup:
+ nf_flow_table_free(&flowtable[0].ft);
+ return ret;
+}
+
+static void __exit xt_flowoffload_tg_exit(void)
+{
+ xt_unregister_target(&offload_tg_reg);
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+ nf_flow_table_free(&flowtable[0].ft);
+ nf_flow_table_free(&flowtable[1].ft);
+}
+
+MODULE_LICENSE("GPL");
+module_init(xt_flowoffload_tg_init);
+module_exit(xt_flowoffload_tg_exit);
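
The target is driven from an iptables ruleset; the matching user-space extension is
shipped separately (for example in OpenWrt's iptables packaging). The second flow
table (flowtable[1], flagged NF_FLOWTABLE_HW_OFFLOAD) is selected when the target's
XT_FLOWOFFLOAD_HW flag is set, which that extension typically exposes as a --hw
option. An indicative software-offload rule:

    # fast-path established connections through the flow table
    iptables -A FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD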


@@ -0,0 +1,152 @@
From 772c6e460211ac740b2550fa75be36b8a49731fe Mon Sep 17 00:00:00 2001
From: "mfreemon@cloudflare.com" <mfreemon@cloudflare.com>
Date: Tue, 1 Mar 2022 17:06:02 -0600
Subject: [PATCH] tcp: Add a sysctl to skip tcp collapse processing when the
receive buffer is full
For context and additional information about this patch, see the
blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/
sysctl: net.ipv4.tcp_collapse_max_bytes
If tcp_collapse_max_bytes is non-zero, attempt to collapse the
queue to free up memory if the current amount of memory allocated
is less than tcp_collapse_max_bytes. Otherwise, the packet is
dropped without attempting to collapse the queue.
If tcp_collapse_max_bytes is zero, this feature is disabled
and the default Linux behavior is used. The default Linux
behavior is to always perform the attempt to collapse the
queue to free up memory.
When the receive queue is small, we want to collapse the
queue. There are two reasons for this: (a) the latency of
performing the collapse will be small on a small queue, and
(b) we want to avoid sending a congestion signal (via a
packet drop) to the sender when the receive queue is small.
The result is that we avoid latency spikes caused by the
time it takes to perform the collapse logic when the receive
queue is large and full, while preserving existing behavior
and performance for all other cases.
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/netns/ipv4.h | 1 +
include/trace/events/tcp.h | 7 +++++++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_ipv4.c | 1 +
5 files changed, 52 insertions(+)
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -223,6 +223,7 @@ struct netns_ipv4 {
u8 sysctl_fib_notify_on_flag_change;
u8 sysctl_tcp_syn_linear_timeouts;
+ unsigned int sysctl_tcp_collapse_max_bytes;
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_udp_l3mdev_accept;
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -213,6 +213,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space
TP_ARGS(sk)
);
+DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded,
+
+ TP_PROTO(struct sock *sk),
+
+ TP_ARGS(sk)
+);
+
TRACE_EVENT(tcp_retransmit_synack,
TP_PROTO(const struct sock *sk, const struct request_sock *req),
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1558,6 +1558,13 @@ static struct ctl_table ipv4_net_table[]
.extra2 = SYSCTL_ONE,
},
{
+ .procname = "tcp_collapse_max_bytes",
+ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ },
+ {
.procname = "tcp_pingpong_thresh",
.data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
.maxlen = sizeof(u8),
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5645,6 +5645,7 @@ static bool tcp_prune_ofo_queue(struct s
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
@@ -5656,6 +5657,39 @@ static int tcp_prune_queue(struct sock *
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
+ /* For context and additional information about this patch, see the
+ * blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/
+ *
+ * sysctl: net.ipv4.tcp_collapse_max_bytes
+ *
+ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the
+ * queue to free up memory if the current amount of memory allocated
+ * is less than tcp_collapse_max_bytes. Otherwise, the packet is
+ * dropped without attempting to collapse the queue.
+ *
+ * If tcp_collapse_max_bytes is zero, this feature is disabled
+ * and the default Linux behavior is used. The default Linux
+ * behavior is to always perform the attempt to collapse the
+ * queue to free up memory.
+ *
+ * When the receive queue is small, we want to collapse the
+ * queue. There are two reasons for this: (a) the latency of
+ * performing the collapse will be small on a small queue, and
+ * (b) we want to avoid sending a congestion signal (via a
+ * packet drop) to the sender when the receive queue is small.
+ *
+ * The result is that we avoid latency spikes caused by the
+ * time it takes to perform the collapse logic when the receive
+ * queue is large and full, while preserving existing behavior
+ * and performance for all other cases.
+ */
+ if (net->ipv4.sysctl_tcp_collapse_max_bytes &&
+ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) {
+ /* We are dropping the packet */
+ trace_tcp_collapse_max_bytes_exceeded(sk);
+ goto do_not_collapse;
+ }
+
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5674,6 +5708,8 @@ static int tcp_prune_queue(struct sock *
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
+do_not_collapse:
+
/* If we are really being abused, tell the caller to silently
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3508,6 +3508,7 @@ static int __net_init tcp_sk_init(struct
net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
net->ipv4.sysctl_tcp_shrink_window = 0;
+ net->ipv4.sysctl_tcp_collapse_max_bytes = 0;
net->ipv4.sysctl_tcp_pingpong_thresh = 1;
net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
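
The knob defaults to 0, which keeps the stock behaviour. With a non-zero value,
collapse is still attempted while sk_rmem_alloc is at or below the limit; once the
allocation exceeds it, the incoming packet is dropped rather than paying for the
collapse pass. An illustrative setting (the 6 MiB figure is only an example):

    # value is illustrative; 0 (the default) disables the feature
    sysctl -w net.ipv4.tcp_collapse_max_bytes=6291456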


@@ -0,0 +1,191 @@
From 454d96573eea54fcb7714a4a455d13173dfb9768 Mon Sep 17 00:00:00 2001
From: Mark Weiman <mark.weiman@markzz.com>
Date: Sun, 12 Aug 2018 11:36:21 -0400
Subject: [PATCH] PCI: Enable overrides for missing ACS capabilities
This an updated version of Alex Williamson's patch from:
https://lkml.org/lkml/2013/5/30/513
Original commit message follows:
PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that
allows us to control whether transactions are allowed to be redirected
in various subnodes of a PCIe topology. For instance, if two
endpoints are below a root port or downstream switch port, the
downstream port may optionally redirect transactions between the
devices, bypassing upstream devices. The same can happen internally
on multifunction devices. The transaction may never be visible to the
upstream devices.
One upstream device that we particularly care about is the IOMMU. If
a redirection occurs in the topology below the IOMMU, then the IOMMU
cannot provide isolation between devices. This is why the PCIe spec
encourages topologies to include ACS support. Without it, we have to
assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation.
Unfortunately, far too many topologies do not support ACS to make this
a steadfast requirement. Even the latest chipsets from Intel are only
sporadically supporting ACS. We have trouble getting interconnect
vendors to include the PCIe spec required PCIe capability, let alone
suggested features.
Therefore, we need to add some flexibility. The pcie_acs_override=
boot option lets users opt-in specific devices or sets of devices to
assume ACS support. The "downstream" option assumes full ACS support
on root ports and downstream switch ports. The "multifunction"
option assumes the subset of ACS features available on multifunction
endpoints and upstream switch ports are supported. The "id:nnnn:nnnn"
option enables ACS support on devices matching the provided vendor
and device IDs, allowing more strategic ACS overrides. These options
may be combined in any order. A maximum of 16 id specific overrides
are available. It's suggested to use the most limited set of options
necessary to avoid completely disabling ACS across the topology.
Note to hardware vendors, we have facilities to permanently quirk
specific devices which enforce isolation but not provide an ACS
capability. Please contact me to have your devices added and save
your customers the hassle of this boot option.
Rebased-by: Alexandre Frade <kernel@xanmod.org>
Signed-off-by: Mark Weiman <mark.weiman@markzz.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
.../admin-guide/kernel-parameters.txt | 9 ++
drivers/pci/quirks.c | 102 ++++++++++++++++++
2 files changed, 111 insertions(+)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4414,6 +4414,15 @@
nomsi [MSI] If the PCI_MSI kernel config parameter is
enabled, this kernel boot option can be used to
disable the use of MSI interrupts system-wide.
+ pcie_acs_override =
+ [PCIE] Override missing PCIe ACS support for:
+ downstream
+ All downstream ports - full ACS capabilities
+ multifunction
+ All multifunction devices - multifunction ACS subset
+ id:nnnn:nnnn
+ Specific device - full ACS capabilities
+ Specified as vid:did (vendor/device ID) in hex
noioapicquirk [APIC] Disable all boot interrupt quirks.
Safety option to keep boot IRQs enabled. This
should never be necessary.
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3747,6 +3747,106 @@ static void quirk_no_bus_reset(struct pc
dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
}
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+ unsigned short vendor;
+ unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "downstream", 10))
+ acs_on_downstream = true;
+ if (!strncmp(p, "multifunction", 13))
+ acs_on_multifunction = true;
+ if (!strncmp(p, "id:", 3)) {
+ char opt[5];
+ int ret;
+ long val;
+
+ if (max_acs_id >= NUM_ACS_IDS - 1) {
+ pr_warn("Out of PCIe ACS override slots (%d)\n",
+ NUM_ACS_IDS);
+ goto next;
+ }
+
+ p += 3;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].vendor = val;
+
+ p += strcspn(p, ":");
+ if (*p != ':') {
+ pr_warn("PCIe ACS invalid ID\n");
+ goto next;
+ }
+
+ p++;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].device = val;
+ max_acs_id++;
+ }
+next:
+ p += strcspn(p, ",");
+ if (*p == ',')
+ p++;
+ }
+
+ if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+ return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+ int i;
+
+ /* Never override ACS for legacy devices or devices with ACS caps */
+ if (!pci_is_pcie(dev) ||
+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+ return -ENOTTY;
+
+ for (i = 0; i < max_acs_id; i++)
+ if (acs_on_ids[i].vendor == dev->vendor &&
+ acs_on_ids[i].device == dev->device)
+ return 1;
+
+ switch (pci_pcie_type(dev)) {
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ case PCI_EXP_TYPE_ROOT_PORT:
+ if (acs_on_downstream)
+ return 1;
+ break;
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_LEG_END:
+ case PCI_EXP_TYPE_RC_END:
+ if (acs_on_multifunction && dev->multifunction)
+ return 1;
+ }
+
+ return -ENOTTY;
+}
/*
* Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
* prevented for those affected devices.
@@ -5168,6 +5268,8 @@ static const struct pci_dev_acs_enabled
{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
/* Wangxun nics */
{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+ /* ACS override */
+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
{ 0 }
};
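
In use, the override is passed on the kernel command line; the options may be
combined in one comma-separated list, and the vendor/device IDs below are only
placeholders:

    pcie_acs_override=downstream,multifunction
    pcie_acs_override=downstream,id:10de:1c82,id:10de:0fb9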


@@ -0,0 +1,219 @@
From e054b70adebd85ed4ab62b458ee6ec3e902d970f Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sun, 27 Feb 2022 14:46:08 -0800
Subject: [PATCH 1/6] extcon: Add driver for Steam Deck
(cherry picked from commit f9f2eddae582ae39d5f89c1218448fc259b90aa8)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/extcon/Kconfig | 7 ++
drivers/extcon/Makefile | 1 +
drivers/extcon/extcon-steamdeck.c | 180 ++++++++++++++++++++++++++++++
3 files changed, 188 insertions(+)
create mode 100644 drivers/extcon/extcon-steamdeck.c
--- a/drivers/extcon/Kconfig
+++ b/drivers/extcon/Kconfig
@@ -203,4 +203,11 @@ config EXTCON_RTK_TYPE_C
The DHC (Digital Home Hub) RTD series SoC contains a type c module.
This driver will detect the status of the type-c port.
+config EXTCON_STEAMDECK
+ tristate "Steam Deck extcon support"
+ depends on MFD_STEAMDECK
+ help
+ Say Y here to enable support of USB Type C cable detection extcon
+ support on Steam Deck devices
+
endif
--- a/drivers/extcon/Makefile
+++ b/drivers/extcon/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_EXTCON_USB_GPIO) += extcon-
obj-$(CONFIG_EXTCON_USBC_CROS_EC) += extcon-usbc-cros-ec.o
obj-$(CONFIG_EXTCON_USBC_TUSB320) += extcon-usbc-tusb320.o
obj-$(CONFIG_EXTCON_RTK_TYPE_C) += extcon-rtk-type-c.o
+obj-$(CONFIG_EXTCON_STEAMDECK) += extcon-steamdeck.o
--- /dev/null
+++ b/drivers/extcon/extcon-steamdeck.c
@@ -0,0 +1,180 @@
+
+#include <linux/acpi.h>
+#include <linux/platform_device.h>
+#include <linux/extcon-provider.h>
+
+#define ACPI_STEAMDECK_NOTIFY_STATUS 0x80
+
+/* 0 - port connected, 1 - port disconnected */
+#define ACPI_STEAMDECK_PORT_CONNECT BIT(0)
+/* 0 - Upstream Facing Port, 1 - Downstream Facing Port */
+#define ACPI_STEAMDECK_CUR_DATA_ROLE BIT(3)
+/*
+ * Debouncing delay to allow negotiation process to settle. 2s value
+ * was arrived at via trial and error.
+ */
+#define STEAMDECK_ROLE_SWITCH_DELAY (msecs_to_jiffies(2000))
+
+struct steamdeck_extcon {
+ struct acpi_device *adev;
+ struct delayed_work role_work;
+ struct extcon_dev *edev;
+ struct device *dev;
+};
+
+static int steamdeck_read_pdcs(struct steamdeck_extcon *sd, unsigned long long *pdcs)
+{
+ acpi_status status;
+
+ status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs);
+ if (ACPI_FAILURE(status)) {
+ dev_err(sd->dev, "PDCS evaluation failed: %s\n",
+ acpi_format_exception(status));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void steamdeck_usb_role_work(struct work_struct *work)
+{
+ struct steamdeck_extcon *sd =
+ container_of(work, struct steamdeck_extcon, role_work.work);
+ unsigned long long pdcs;
+ bool usb_host;
+
+ if (steamdeck_read_pdcs(sd, &pdcs))
+ return;
+
+ /*
+ * We only care about these two
+ */
+ pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE;
+
+ /*
+ * For "connect" events our role is determined by a bit in
+ * PDCS, for "disconnect" we switch to being a gadget
+ * unconditionally. The thinking for the latter is we don't
+ * want to start acting as a USB host until we get
+ * confirmation from the firmware that we are a USB host
+ */
+ usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+ pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false;
+
+ dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device");
+ WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST,
+ usb_host));
+
+}
+
+static void steamdeck_notify(acpi_handle handle, u32 event, void *context)
+{
+ struct device *dev = context;
+ struct steamdeck_extcon *sd = dev_get_drvdata(dev);
+ unsigned long long pdcs;
+ unsigned long delay;
+
+ switch (event) {
+ case ACPI_STEAMDECK_NOTIFY_STATUS:
+ if (steamdeck_read_pdcs(sd, &pdcs))
+ return;
+ /*
+ * We process "disconnect" events immediately and
+ * "connect" events with a delay to give the HW time
+ * to settle. For example attaching USB hub (at least
+ * for HW used for testing) will generate intermediary
+ * event with "host" bit not set, followed by the one
+ * that does have it set.
+ */
+ delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+ STEAMDECK_ROLE_SWITCH_DELAY : 0;
+
+ queue_delayed_work(system_long_wq, &sd->role_work, delay);
+ break;
+ default:
+ dev_warn(dev, "Unsupported event [0x%x]\n", event);
+ }
+}
+
+static void steamdeck_remove_notify_handler(void *data)
+{
+ struct steamdeck_extcon *sd = data;
+
+ acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY,
+ steamdeck_notify);
+ cancel_delayed_work_sync(&sd->role_work);
+}
+
+static const unsigned int steamdeck_extcon_cable[] = {
+ EXTCON_USB,
+ EXTCON_USB_HOST,
+ EXTCON_CHG_USB_SDP,
+ EXTCON_CHG_USB_CDP,
+ EXTCON_CHG_USB_DCP,
+ EXTCON_CHG_USB_ACA,
+ EXTCON_NONE,
+};
+
+static int steamdeck_extcon_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct steamdeck_extcon *sd;
+ acpi_status status;
+ int ret;
+
+ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+
+ INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work);
+ platform_set_drvdata(pdev, sd);
+ sd->adev = ACPI_COMPANION(dev->parent);
+ sd->dev = dev;
+ sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable);
+ if (IS_ERR(sd->edev))
+ return PTR_ERR(sd->edev);
+
+ ret = devm_extcon_dev_register(dev, sd->edev);
+ if (ret < 0) {
+ dev_err(dev, "Failed to register extcon device: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Set initial role value
+ */
+ queue_delayed_work(system_long_wq, &sd->role_work, 0);
+ flush_delayed_work(&sd->role_work);
+
+ status = acpi_install_notify_handler(sd->adev->handle,
+ ACPI_DEVICE_NOTIFY,
+ steamdeck_notify,
+ dev);
+ if (ACPI_FAILURE(status)) {
+ dev_err(dev, "Error installing ACPI notify handler\n");
+ return -EIO;
+ }
+
+ ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler,
+ sd);
+ return ret;
+}
+
+static const struct platform_device_id steamdeck_extcon_id_table[] = {
+ { .name = "steamdeck-extcon" },
+ {}
+};
+MODULE_DEVICE_TABLE(platform, steamdeck_extcon_id_table);
+
+static struct platform_driver steamdeck_extcon_driver = {
+ .probe = steamdeck_extcon_probe,
+ .driver = {
+ .name = "steamdeck-extcon",
+ },
+ .id_table = steamdeck_extcon_id_table,
+};
+module_platform_driver(steamdeck_extcon_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck extcon driver");
+MODULE_LICENSE("GPL");
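
Since this registers a regular extcon device, the detected role can be inspected
from user space through the generic extcon class; the instance index depends on
probe order and the output line shown is indicative (other registered cable lines
are omitted):

    cat /sys/class/extcon/extcon0/state
    USB-HOST=1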


@@ -0,0 +1,274 @@
From be312a466f817a38a48668f93bb07a75ff9fa875 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sat, 19 Feb 2022 16:09:45 -0800
Subject: [PATCH 2/6] hwmon: Add driver for Steam Deck's EC sensors
Add driver for sensors exposed by EC firmware on Steam Deck hardware.
(cherry picked from commit 6917aac77bee6185ae3920b936cdbe7876118c0b)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/hwmon/Kconfig | 11 ++
drivers/hwmon/Makefile | 1 +
drivers/hwmon/steamdeck-hwmon.c | 224 ++++++++++++++++++++++++++++++++
3 files changed, 236 insertions(+)
create mode 100644 drivers/hwmon/steamdeck-hwmon.c
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -2050,6 +2050,17 @@ config SENSORS_SCH5636
This driver can also be built as a module. If so, the module
will be called sch5636.
+config SENSORS_STEAMDECK
+ tristate "Steam Deck EC sensors"
+ depends on MFD_STEAMDECK
+ help
+ If you say yes here you get support for the hardware
+ monitoring features exposed by EC firmware on Steam Deck
+ devices
+
+ This driver can also be built as a module. If so, the module
+ will be called steamdeck-hwmon.
+
config SENSORS_STTS751
tristate "ST Microelectronics STTS751"
depends on I2C
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -207,6 +207,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47
obj-$(CONFIG_SENSORS_SMSC47M192)+= smsc47m192.o
obj-$(CONFIG_SENSORS_SPARX5) += sparx5-temp.o
obj-$(CONFIG_SENSORS_SPD5118) += spd5118.o
+obj-$(CONFIG_SENSORS_STEAMDECK) += steamdeck-hwmon.o
obj-$(CONFIG_SENSORS_STTS751) += stts751.o
obj-$(CONFIG_SENSORS_SURFACE_FAN)+= surface_fan.o
obj-$(CONFIG_SENSORS_SY7636A) += sy7636a-hwmon.o
--- /dev/null
+++ b/drivers/hwmon/steamdeck-hwmon.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Steam Deck EC sensors driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ */
+
+#include <linux/acpi.h>
+#include <linux/hwmon.h>
+#include <linux/platform_device.h>
+
+#define STEAMDECK_HWMON_NAME "steamdeck-hwmon"
+
+struct steamdeck_hwmon {
+ struct acpi_device *adev;
+};
+
+static long
+steamdeck_hwmon_get(struct steamdeck_hwmon *sd, const char *method)
+{
+ unsigned long long val;
+ if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+ (char *)method, NULL, &val)))
+ return -EIO;
+
+ return val;
+}
+
+static int
+steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *out)
+{
+ struct steamdeck_hwmon *sd = dev_get_drvdata(dev);
+
+ switch (type) {
+ case hwmon_curr:
+ if (attr != hwmon_curr_input)
+ return -EOPNOTSUPP;
+
+ *out = steamdeck_hwmon_get(sd, "PDAM");
+ if (*out < 0)
+ return *out;
+ break;
+ case hwmon_in:
+ if (attr != hwmon_in_input)
+ return -EOPNOTSUPP;
+
+ *out = steamdeck_hwmon_get(sd, "PDVL");
+ if (*out < 0)
+ return *out;
+ break;
+ case hwmon_temp:
+ if (attr != hwmon_temp_input)
+ return -EOPNOTSUPP;
+
+ *out = steamdeck_hwmon_get(sd, "BATT");
+ if (*out < 0)
+ return *out;
+ /*
+ * Assuming BATT returns deg C we need to multiply it
+ * by 1000 to convert to mC
+ */
+ *out *= 1000;
+ break;
+ case hwmon_fan:
+ switch (attr) {
+ case hwmon_fan_input:
+ *out = steamdeck_hwmon_get(sd, "FANR");
+ if (*out < 0)
+ return *out;
+ break;
+ case hwmon_fan_target:
+ *out = steamdeck_hwmon_get(sd, "FSSR");
+ if (*out < 0)
+ return *out;
+ break;
+ case hwmon_fan_fault:
+ *out = steamdeck_hwmon_get(sd, "FANC");
+ if (*out < 0)
+ return *out;
+ /*
+ * FANC (Fan check):
+ * 0: Abnormal
+ * 1: Normal
+ */
+ *out = !*out;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int
+steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, const char **str)
+{
+ switch (type) {
+ /*
+ * These two aren't, strictly speaking, measured. EC
+ * firmware just reports what PD negotiation resulted
+ * in.
+ */
+ case hwmon_curr:
+ *str = "PD Contract Current";
+ break;
+ case hwmon_in:
+ *str = "PD Contract Voltage";
+ break;
+ case hwmon_temp:
+ *str = "Battery Temp";
+ break;
+ case hwmon_fan:
+ *str = "System Fan";
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int
+steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long val)
+{
+ struct steamdeck_hwmon *sd = dev_get_drvdata(dev);
+
+ if (type != hwmon_fan ||
+ attr != hwmon_fan_target)
+ return -EOPNOTSUPP;
+
+ val = clamp_val(val, 0, 7300);
+
+ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+ "FANS", val)))
+ return -EIO;
+
+ return 0;
+}
+
+static umode_t
+steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+ if (type == hwmon_fan &&
+ attr == hwmon_fan_target)
+ return 0644;
+
+ return 0444;
+}
+
+static const struct hwmon_channel_info *steamdeck_hwmon_info[] = {
+ HWMON_CHANNEL_INFO(in,
+ HWMON_I_INPUT | HWMON_I_LABEL),
+ HWMON_CHANNEL_INFO(curr,
+ HWMON_C_INPUT | HWMON_C_LABEL),
+ HWMON_CHANNEL_INFO(temp,
+ HWMON_T_INPUT | HWMON_T_LABEL),
+ HWMON_CHANNEL_INFO(fan,
+ HWMON_F_INPUT | HWMON_F_LABEL |
+ HWMON_F_TARGET | HWMON_F_FAULT),
+ NULL
+};
+
+static const struct hwmon_ops steamdeck_hwmon_ops = {
+ .is_visible = steamdeck_hwmon_is_visible,
+ .read = steamdeck_hwmon_read,
+ .read_string = steamdeck_hwmon_read_string,
+ .write = steamdeck_hwmon_write,
+};
+
+static const struct hwmon_chip_info steamdeck_hwmon_chip_info = {
+ .ops = &steamdeck_hwmon_ops,
+ .info = steamdeck_hwmon_info,
+};
+
+static int steamdeck_hwmon_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct steamdeck_hwmon *sd;
+ struct device *hwmon;
+
+ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+
+ sd->adev = ACPI_COMPANION(dev->parent);
+ hwmon = devm_hwmon_device_register_with_info(dev,
+ "steamdeck_hwmon",
+ sd,
+ &steamdeck_hwmon_chip_info,
+ NULL);
+ if (IS_ERR(hwmon)) {
+ dev_err(dev, "Failed to register HWMON device");
+ return PTR_ERR(hwmon);
+ }
+
+ return 0;
+}
+
+static const struct platform_device_id steamdeck_hwmon_id_table[] = {
+ { .name = STEAMDECK_HWMON_NAME },
+ {}
+};
+MODULE_DEVICE_TABLE(platform, steamdeck_hwmon_id_table);
+
+static struct platform_driver steamdeck_hwmon_driver = {
+ .probe = steamdeck_hwmon_probe,
+ .driver = {
+ .name = STEAMDECK_HWMON_NAME,
+ },
+ .id_table = steamdeck_hwmon_id_table,
+};
+module_platform_driver(steamdeck_hwmon_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck EC sensors driver");
+MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,104 @@
From efe7b80f8a7448265d238c13ed1b6e327a9c739e Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sat, 15 Jul 2023 12:58:54 -0700
Subject: [PATCH 3/6] hwmon: steamdeck-hwmon: Add support for max battery
level/rate
Add support for max battery level/charge rate attributes.
Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
(cherry picked from commit 50af83e8fd75dc52221edd3fb6fd7a7f70c4d8a4)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/hwmon/steamdeck-hwmon.c | 72 ++++++++++++++++++++++++++++++++-
1 file changed, 71 insertions(+), 1 deletion(-)
--- a/drivers/hwmon/steamdeck-hwmon.c
+++ b/drivers/hwmon/steamdeck-hwmon.c
@@ -180,6 +180,76 @@ static const struct hwmon_chip_info stea
.info = steamdeck_hwmon_info,
};
+
+static ssize_t
+steamdeck_hwmon_simple_store(struct device *dev, const char *buf, size_t count,
+ const char *method,
+ unsigned long upper_limit)
+{
+ struct steamdeck_hwmon *sd = dev_get_drvdata(dev);
+ unsigned long value;
+
+ if (kstrtoul(buf, 10, &value) || value >= upper_limit)
+ return -EINVAL;
+
+ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+ (char *)method, value)))
+ return -EIO;
+
+ return count;
+}
+
+static ssize_t
+steamdeck_hwmon_simple_show(struct device *dev, char *buf,
+ const char *method)
+{
+ struct steamdeck_hwmon *sd = dev_get_drvdata(dev);
+	long value;
+
+ value = steamdeck_hwmon_get(sd, method);
+ if (value < 0)
+ return value;
+
+ return sprintf(buf, "%ld\n", value);
+}
+
+#define STEAMDECK_HWMON_ATTR_RW(_name, _set_method, _get_method, \
+ _upper_limit) \
+ static ssize_t _name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ return steamdeck_hwmon_simple_show(dev, buf, \
+ _get_method); \
+ } \
+ static ssize_t _name##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+ { \
+ return steamdeck_hwmon_simple_store(dev, buf, count, \
+ _set_method, \
+ _upper_limit); \
+ } \
+ static DEVICE_ATTR_RW(_name)
+
+STEAMDECK_HWMON_ATTR_RW(max_battery_charge_level, "FCBL", "SFBL", 101);
+STEAMDECK_HWMON_ATTR_RW(max_battery_charge_rate, "CHGR", "GCHR", 101);
+
+static struct attribute *steamdeck_hwmon_attributes[] = {
+ &dev_attr_max_battery_charge_level.attr,
+ &dev_attr_max_battery_charge_rate.attr,
+ NULL
+};
+
+static const struct attribute_group steamdeck_hwmon_group = {
+ .attrs = steamdeck_hwmon_attributes,
+};
+
+static const struct attribute_group *steamdeck_hwmon_groups[] = {
+ &steamdeck_hwmon_group,
+ NULL
+};
+
static int steamdeck_hwmon_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -195,7 +265,7 @@ static int steamdeck_hwmon_probe(struct
"steamdeck_hwmon",
sd,
&steamdeck_hwmon_chip_info,
- NULL);
+ steamdeck_hwmon_groups);
if (IS_ERR(hwmon)) {
dev_err(dev, "Failed to register HWMON device");
return PTR_ERR(hwmon);
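
The two new attributes are plain device attributes attached to the same hwmon device, so they appear next to the sensor files. A hedged usage sketch (hwmon2 is again an assumed index; both values appear to be percentages and the driver rejects anything above 100):

    cat /sys/class/hwmon/hwmon2/max_battery_charge_level
    echo 80 > /sys/class/hwmon/hwmon2/max_battery_charge_level   # stop charging at 80%
    echo 50 > /sys/class/hwmon/hwmon2/max_battery_charge_rate    # limit the charge rate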

View File

@@ -0,0 +1,118 @@
From e76032abd59b7d2b15c8106630423e3091b298ce Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sun, 27 Feb 2022 12:58:05 -0800
Subject: [PATCH 4/6] leds: steamdeck: Add support for Steam Deck LED
(cherry picked from commit 85a86d19aa7022ff0555023d53aef78323a42d0c)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/leds/Kconfig | 7 ++++
drivers/leds/Makefile | 1 +
drivers/leds/leds-steamdeck.c | 74 +++++++++++++++++++++++++++++++++++
3 files changed, 82 insertions(+)
create mode 100644 drivers/leds/leds-steamdeck.c
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -951,6 +951,13 @@ config LEDS_ACER_A500
This option enables support for the Power Button LED of
Acer Iconia Tab A500.
+config LEDS_STEAMDECK
+ tristate "LED support for Steam Deck"
+ depends on LEDS_CLASS && MFD_STEAMDECK
+ help
+	  This option enables support for the status LED (next to the
+	  power button) on the Steam Deck.
+
source "drivers/leds/blink/Kconfig"
comment "Flash and Torch LED drivers"
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_LEDS_POWERNV) += leds-powe
obj-$(CONFIG_LEDS_PWM) += leds-pwm.o
obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o
obj-$(CONFIG_LEDS_SC27XX_BLTC) += leds-sc27xx-bltc.o
+obj-$(CONFIG_LEDS_STEAMDECK) += leds-steamdeck.o
obj-$(CONFIG_LEDS_SUN50I_A100) += leds-sun50i-a100.o
obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o
obj-$(CONFIG_LEDS_SYSCON) += leds-syscon.o
--- /dev/null
+++ b/drivers/leds/leds-steamdeck.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Steam Deck EC MFD LED cell driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ *
+ */
+
+#include <linux/acpi.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+
+struct steamdeck_led {
+ struct acpi_device *adev;
+ struct led_classdev cdev;
+};
+
+static int steamdeck_leds_brightness_set(struct led_classdev *cdev,
+ enum led_brightness value)
+{
+ struct steamdeck_led *sd = container_of(cdev, struct steamdeck_led,
+ cdev);
+
+ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+ "CHBV", value)))
+ return -EIO;
+
+ return 0;
+}
+
+static int steamdeck_leds_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct steamdeck_led *sd;
+ int ret;
+
+ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+
+ sd->adev = ACPI_COMPANION(dev->parent);
+
+ sd->cdev.name = "status:white";
+ sd->cdev.brightness_set_blocking = steamdeck_leds_brightness_set;
+ sd->cdev.max_brightness = 100;
+
+ ret = devm_led_classdev_register(dev, &sd->cdev);
+ if (ret) {
+ dev_err(dev, "Failed to register LEDs device: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct platform_device_id steamdeck_leds_id_table[] = {
+ { .name = "steamdeck-leds" },
+ {}
+};
+MODULE_DEVICE_TABLE(platform, steamdeck_leds_id_table);
+
+static struct platform_driver steamdeck_leds_driver = {
+ .probe = steamdeck_leds_probe,
+ .driver = {
+ .name = "steamdeck-leds",
+ },
+ .id_table = steamdeck_leds_id_table,
+};
+module_platform_driver(steamdeck_leds_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck LEDs driver");
+MODULE_LICENSE("GPL");
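
The LED registers through the standard LED class, so it can be driven from sysfs. A small sketch, assuming the cdev name set by the driver ("status:white", max_brightness 100):

    cat /sys/class/leds/status:white/max_brightness    # 100
    echo 0   > /sys/class/leds/status:white/brightness  # LED off
    echo 100 > /sys/class/leds/status:white/brightness  # full brightness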

View File

@@ -0,0 +1,176 @@
From c3cf089a9da1216d3e1e047eba2ec46e0febe457 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sat, 19 Feb 2022 16:08:36 -0800
Subject: [PATCH 5/6] mfd: Add MFD core driver for Steam Deck
Add MFD core driver for Steam Deck. Doesn't really do much so far
besides instantiating a number of MFD cells that implement all the
interesting functionality.
(cherry picked from commit 5f534c2d6ebdefccb9c024eb0f013bc1c0c622d9)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/mfd/Kconfig | 11 ++++
drivers/mfd/Makefile | 2 +
drivers/mfd/steamdeck.c | 127 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 140 insertions(+)
create mode 100644 drivers/mfd/steamdeck.c
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -2390,5 +2390,16 @@ config MFD_RSMU_SPI
Additional drivers must be enabled in order to use the functionality
of the device.
+config MFD_STEAMDECK
+ tristate "Valve Steam Deck"
+ select MFD_CORE
+ depends on ACPI
+ depends on X86_64 || COMPILE_TEST
+ help
+ This driver registers various MFD cells that expose aspects
+ of Steam Deck specific ACPI functionality.
+
+ Say N here, unless you are running on Steam Deck hardware.
+
endmenu
endif
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -288,3 +288,5 @@ obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x
obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o
obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o
+
+obj-$(CONFIG_MFD_STEAMDECK) += steamdeck.o
--- /dev/null
+++ b/drivers/mfd/steamdeck.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Steam Deck EC MFD core driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ *
+ */
+
+#include <linux/acpi.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/core.h>
+
+#define STEAMDECK_STA_OK \
+ (ACPI_STA_DEVICE_ENABLED | \
+ ACPI_STA_DEVICE_PRESENT | \
+ ACPI_STA_DEVICE_FUNCTIONING)
+
+struct steamdeck {
+ struct acpi_device *adev;
+ struct device *dev;
+};
+
+#define STEAMDECK_ATTR_RO(_name, _method) \
+ static ssize_t _name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct steamdeck *sd = dev_get_drvdata(dev); \
+ unsigned long long val; \
+ \
+ if (ACPI_FAILURE(acpi_evaluate_integer( \
+ sd->adev->handle, \
+ _method, NULL, &val))) \
+ return -EIO; \
+ \
+ return sysfs_emit(buf, "%llu\n", val); \
+ } \
+ static DEVICE_ATTR_RO(_name)
+
+STEAMDECK_ATTR_RO(firmware_version, "PDFW");
+STEAMDECK_ATTR_RO(board_id, "BOID");
+
+static struct attribute *steamdeck_attrs[] = {
+ &dev_attr_firmware_version.attr,
+ &dev_attr_board_id.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(steamdeck);
+
+static const struct mfd_cell steamdeck_cells[] = {
+ { .name = "steamdeck-hwmon" },
+ { .name = "steamdeck-leds" },
+ { .name = "steamdeck-extcon" },
+};
+
+static void steamdeck_remove_sysfs_groups(void *data)
+{
+ struct steamdeck *sd = data;
+
+ sysfs_remove_groups(&sd->dev->kobj, steamdeck_groups);
+}
+
+static int steamdeck_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ unsigned long long sta;
+ struct steamdeck *sd;
+ acpi_status status;
+ int ret;
+
+ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ sd->adev = ACPI_COMPANION(dev);
+ sd->dev = dev;
+ platform_set_drvdata(pdev, sd);
+
+ status = acpi_evaluate_integer(sd->adev->handle, "_STA",
+ NULL, &sta);
+ if (ACPI_FAILURE(status)) {
+ dev_err(dev, "Status check failed (0x%x)\n", status);
+ return -EINVAL;
+ }
+
+ if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) {
+ dev_err(dev, "Device is not ready\n");
+ return -EINVAL;
+ }
+
+ ret = sysfs_create_groups(&dev->kobj, steamdeck_groups);
+ if (ret) {
+ dev_err(dev, "Failed to create sysfs group\n");
+ return ret;
+ }
+
+ ret = devm_add_action_or_reset(dev, steamdeck_remove_sysfs_groups,
+ sd);
+ if (ret) {
+ dev_err(dev, "Failed to register devres action\n");
+ return ret;
+ }
+
+ return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE,
+ steamdeck_cells, ARRAY_SIZE(steamdeck_cells),
+ NULL, 0, NULL);
+}
+
+static const struct acpi_device_id steamdeck_device_ids[] = {
+ { "VLV0100", 0 },
+ { "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids);
+
+static struct platform_driver steamdeck_driver = {
+ .probe = steamdeck_probe,
+ .driver = {
+ .name = "steamdeck",
+ .acpi_match_table = steamdeck_device_ids,
+ },
+};
+module_platform_driver(steamdeck_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck EC MFD core driver");
+MODULE_LICENSE("GPL");
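
The two read-only attributes are created on the ACPI-enumerated platform device itself. A sketch of reading them from a shell, assuming the usual platform device name for the VLV0100 companion (the instance suffix may differ):

    cat /sys/bus/platform/devices/VLV0100:00/firmware_version
    cat /sys/bus/platform/devices/VLV0100:00/board_id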

View File

@@ -0,0 +1,49 @@
From c24abd65482a08109e756c612e83c8c8450a263b Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Sun, 24 Sep 2023 15:02:33 -0700
Subject: [PATCH 6/6] mfd: steamdeck: Expose controller board power in sysfs
As of version 118, the Deck's BIOS implements an "SCBP" method that allows
gating power of the controller board (VBUS). Add a basic write-only sysfs
attribute to our root MFD device to allow toggling that.
Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
(cherry picked from commit f97f32718acc10cbb51fef925842392e80904d74)
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/mfd/steamdeck.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
--- a/drivers/mfd/steamdeck.c
+++ b/drivers/mfd/steamdeck.c
@@ -41,9 +41,29 @@ struct steamdeck {
STEAMDECK_ATTR_RO(firmware_version, "PDFW");
STEAMDECK_ATTR_RO(board_id, "BOID");
+static ssize_t controller_board_power_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct steamdeck *sd = dev_get_drvdata(dev);
+ bool enabled;
+ ssize_t ret = kstrtobool(buf, &enabled);
+
+ if (ret)
+ return ret;
+
+ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+ "SCBP", enabled)))
+ return -EIO;
+
+ return count;
+}
+static DEVICE_ATTR_WO(controller_board_power);
+
static struct attribute *steamdeck_attrs[] = {
&dev_attr_firmware_version.attr,
&dev_attr_board_id.attr,
+ &dev_attr_controller_board_power.attr,
NULL
};
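
The new attribute is write-only and accepts any kstrtobool-style value. A usage sketch under the same assumed device path as above:

    echo 0 > /sys/bus/platform/devices/VLV0100:00/controller_board_power   # cut VBUS to the controller board
    echo 1 > /sys/bus/platform/devices/VLV0100:00/controller_board_power   # power it back up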

View File

@@ -0,0 +1,25 @@
From 2a949c896ea66e647d94254c28b1d4d6194ee14b Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 16 Sep 2024 00:55:35 +0000
Subject: [PATCH 03/19] XANMOD: kbuild: Add GCC SMS-based modulo scheduling
flags
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
Makefile | 5 +++++
1 file changed, 5 insertions(+)
--- a/Makefile
+++ b/Makefile
@@ -822,6 +822,11 @@ KBUILD_CFLAGS += -Os
KBUILD_RUSTFLAGS += -Copt-level=s
endif
+# Perform swing modulo scheduling immediately before the first scheduling pass.
+# This pass looks at innermost loops and reorders their instructions by
+# overlapping different iterations.
+KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves)
+
# Always set `debug-assertions` and `overflow-checks` because their default
# depends on `opt-level` and `debug-assertions`, respectively.
KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n)
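
Because the flags are wrapped in cc-option, they are silently dropped when the compiler does not accept them (Clang, for instance, does not know -fmodulo-sched). A quick sketch for checking support manually, mirroring what cc-option does:

    gcc -Werror -fmodulo-sched -fmodulo-sched-allow-regmoves \
        -c -x c /dev/null -o /dev/null && echo supported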

View File

@@ -0,0 +1,76 @@
From 867a311f231e89b4c9194579e40718f751761ffc Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Sat, 31 Aug 2024 16:57:41 +0000
Subject: [PATCH 04/19] kbuild: Remove GCC minimal function alignment
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
Makefile | 7 -------
arch/Kconfig | 12 ------------
include/linux/compiler_types.h | 10 +++++-----
3 files changed, 5 insertions(+), 24 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -981,15 +981,8 @@ export CC_FLAGS_FPU
export CC_FLAGS_NO_FPU
ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0)
-# Set the minimal function alignment. Use the newer GCC option
-# -fmin-function-alignment if it is available, or fall back to -falign-funtions.
-# See also CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT.
-ifdef CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT
-KBUILD_CFLAGS += -fmin-function-alignment=$(CONFIG_FUNCTION_ALIGNMENT)
-else
KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT)
endif
-endif
# arch Makefile may override CC so keep this after arch Makefile is included
NOSTDINC_FLAGS += -nostdinc
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1628,18 +1628,6 @@ config FUNCTION_ALIGNMENT
default 4 if FUNCTION_ALIGNMENT_4B
default 0
-config CC_HAS_MIN_FUNCTION_ALIGNMENT
- # Detect availability of the GCC option -fmin-function-alignment which
- # guarantees minimal alignment for all functions, unlike
- # -falign-functions which the compiler ignores for cold functions.
- def_bool $(cc-option, -fmin-function-alignment=8)
-
-config CC_HAS_SANE_FUNCTION_ALIGNMENT
- # Set if the guaranteed alignment with -fmin-function-alignment is
- # available or extra care is required in the kernel. Clang provides
- # strict alignment always, even with -falign-functions.
- def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
-
config ARCH_NEED_CMPXCHG_1_EMU
bool
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -99,17 +99,17 @@ static inline void __chk_io_ptr(const vo
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute
*
* When -falign-functions=N is in use, we must avoid the cold attribute as
- * GCC drops the alignment for cold functions. Worse, GCC can implicitly mark
- * callees of cold functions as cold themselves, so it's not sufficient to add
- * __function_aligned here as that will not ensure that callees are correctly
- * aligned.
+ * contemporary versions of GCC drop the alignment for cold functions. Worse,
+ * GCC can implicitly mark callees of cold functions as cold themselves, so
+ * it's not sufficient to add __function_aligned here as that will not ensure
+ * that callees are correctly aligned.
*
* See:
*
* https://lore.kernel.org/lkml/Y77%2FqVgvaJidFpYt@FVFF77S0Q05N
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c9
*/
-#if defined(CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT) || (CONFIG_FUNCTION_ALIGNMENT == 0)
+#if !defined(CONFIG_CC_IS_GCC) || (CONFIG_FUNCTION_ALIGNMENT == 0)
#define __cold __attribute__((__cold__))
#else
#define __cold

View File

@@ -0,0 +1,22 @@
From e98005253085b5a00881c085fab388a4742e700b Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Thu, 11 May 2023 19:41:41 +0000
Subject: [PATCH 05/19] XANMOD: fair: Set scheduler tunable latencies to
unscaled
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -66,7 +66,7 @@
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
/*
* Minimal preemption granularity for CPU-bound tasks:

View File

@@ -0,0 +1,71 @@
From 07341955b471371f5414f94814c49289de9319cf Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Sun, 15 Sep 2024 23:03:38 +0000
Subject: [PATCH 06/19] XANMOD: sched: Add yield_type sysctl to reduce or
disable sched_yield
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
kernel/sched/syscalls.c | 16 +++++++++++++++-
kernel/sysctl.c | 10 ++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1457,15 +1457,29 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t
return ret;
}
+/**
+ * sysctl_sched_yield_type - choose how much work sched_yield() performs.
+ *
+ * 0: No yield.
+ * 1: Yield only to better priority/deadline tasks.
+ * 2: Re-queue the current task. (default CFS behaviour)
+ */
+__read_mostly int sysctl_sched_yield_type = 2;
+
static void do_sched_yield(void)
{
struct rq_flags rf;
struct rq *rq;
+ if (!sysctl_sched_yield_type)
+ return;
+
rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
+
+ if (sysctl_sched_yield_type > 1)
+ current->sched_class->yield_task(rq);
preempt_disable();
rq_unlock_irq(rq, &rf);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,6 +97,7 @@ static const int six_hundred_forty_kb =
#endif
+extern int sysctl_sched_yield_type;
static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1631,6 +1632,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "yield_type",
+ .data = &sysctl_sched_yield_type,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",

View File

@@ -0,0 +1,39 @@
From 9d6ba825d8c8635e217f7596fb4401c9ef9d408a Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Wed, 11 May 2022 18:56:51 +0000
Subject: [PATCH 07/19] XANMOD: block/mq-deadline: Increase write priority to
improve responsiveness
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
block/mq-deadline.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -4,6 +4,9 @@
* for the blk-mq scheduling framework
*
* Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ *
+ * Tunes for responsiveness by Alexandre Frade
+ * (C) 2022 Alexandre Frade <kernel@xanmod.org>
*/
#include <linux/kernel.h>
#include <linux/fs.h>
@@ -28,13 +31,13 @@
* See Documentation/block/deadline-iosched.rst
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
-static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int write_expire = HZ; /* ditto for writes, these limits are SOFT! */
/*
* Time after which to dispatch lower priority requests even if higher
* priority requests are pending.
*/
static const int prio_aging_expire = 10 * HZ;
-static const int writes_starved = 2; /* max times reads can starve a write */
+static const int writes_starved = 1; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
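
Both values remain runtime-tunable through the scheduler's sysfs directory whenever mq-deadline is the active scheduler for a queue, so the stock defaults can be restored without rebuilding. A sketch for an assumed device sda:

    cat /sys/block/sda/queue/iosched/write_expire      # 1000 (ms) with this patch, 5000 upstream
    echo 5000 > /sys/block/sda/queue/iosched/write_expire
    echo 2 > /sys/block/sda/queue/iosched/writes_starved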

View File

@@ -0,0 +1,22 @@
From f4b45e0e5444254caf5992cde662236867ac388b Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Thu, 6 Jan 2022 16:59:01 +0000
Subject: [PATCH 08/19] XANMOD: block/mq-deadline: Disable front_merges by
default
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
block/mq-deadline.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -600,7 +600,7 @@ static int dd_init_sched(struct request_
dd->fifo_expire[DD_READ] = read_expire;
dd->fifo_expire[DD_WRITE] = write_expire;
dd->writes_starved = writes_starved;
- dd->front_merges = 1;
+ dd->front_merges = 0;
dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
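
As with the previous patch, this only changes the default; front merging can be re-enabled per queue, e.g.:

    echo 1 > /sys/block/sda/queue/iosched/front_merges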

View File

@@ -0,0 +1,23 @@
From 08580ad4d8ffb10cb30a228361d2d914a1502e3c Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 16 Sep 2024 15:36:01 +0000
Subject: [PATCH 09/19] XANMOD: block: Set rq_affinity to force complete I/O
requests on same CPU
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/blkdev.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -605,7 +605,8 @@ enum {
QUEUE_FLAG_MAX
};
-#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP)
+#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \
+ (1UL << QUEUE_FLAG_SAME_FORCE))
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
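
With both flags in the default mask, queues start with rq_affinity=2 (force completion on the submitting CPU group) instead of 1. This can be checked or reverted per device:

    cat /sys/block/sda/queue/rq_affinity     # 2 with this patch
    echo 1 > /sys/block/sda/queue/rq_affinity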

View File

@@ -0,0 +1,30 @@
From 278088038e8259faf193e23ac16b48b0350da6fb Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 15 Jul 2024 04:50:34 +0000
Subject: [PATCH 10/19] XANMOD: blk-wbt: Set wbt_default_latency_nsec() to
2msec
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
block/blk-wbt.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -730,14 +730,8 @@ EXPORT_SYMBOL_GPL(wbt_enable_default);
u64 wbt_default_latency_nsec(struct request_queue *q)
{
- /*
- * We default to 2msec for non-rotational storage, and 75msec
- * for rotational storage.
- */
- if (blk_queue_nonrot(q))
- return 2000000ULL;
- else
- return 75000000ULL;
+ /* XanMod defaults to 2msec for any type of storage */
+ return 2000000ULL;
}
static int wbt_data_dir(const struct request *rq)
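
The writeback throttling target stays adjustable through sysfs; the patch only changes what "default" means. Sketch for an assumed rotational disk where the upstream 75 msec may still be preferable:

    cat /sys/block/sda/queue/wbt_lat_usec      # 2000 with this patch
    echo 75000 > /sys/block/sda/queue/wbt_lat_usec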

View File

@@ -0,0 +1,35 @@
From 1637fbf551572f4578ed5644ae485b8da659b509 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 29 Jan 2018 17:26:15 +0000
Subject: [PATCH 11/19] XANMOD: kconfig: add 500Hz timer interrupt kernel
config option
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
kernel/Kconfig.hz | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,13 @@ choice
on SMP and NUMA systems and exactly dividing by both PAL and
NTSC frame rates for video and multimedia work.
+ config HZ_500
+ bool "500 HZ"
+ help
+	 500 Hz is a balanced timer frequency. It provides fast interactivity
+	 on desktops with great smoothness without increasing CPU power
+	 consumption or sacrificing battery life on laptops.
+
config HZ_1000
bool "1000 HZ"
help
@@ -53,6 +60,7 @@ config HZ
default 100 if HZ_100
default 250 if HZ_250
default 300 if HZ_300
+ default 500 if HZ_500
default 1000 if HZ_1000
config SCHED_HRTICK
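
Whether the new option took effect can be confirmed from the running kernel's configuration, assuming CONFIG_IKCONFIG_PROC is enabled:

    zgrep 'CONFIG_HZ=' /proc/config.gz     # CONFIG_HZ=500 when HZ_500 is selected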

View File

@@ -0,0 +1,22 @@
From 53a8dc8afb62674acc347ac7d9ebb46338ab7d64 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 29 Jan 2018 16:59:22 +0000
Subject: [PATCH 12/19] XANMOD: dcache: cache_pressure = 50 decreases the rate
at which VFS caches are reclaimed
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
fs/dcache.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -73,7 +73,7 @@
* If no ancestor relationship:
* arbitrary, since it's serialized on rename_lock
*/
-int sysctl_vfs_cache_pressure __read_mostly = 100;
+int sysctl_vfs_cache_pressure __read_mostly = 50;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
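
The value stays exposed as a normal sysctl, so the upstream default is one command away if the lower reclaim rate lets the dentry/inode caches grow too large:

    sysctl vm.vfs_cache_pressure          # 50 with this patch
    sysctl -w vm.vfs_cache_pressure=100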

View File

@@ -0,0 +1,21 @@
From 73df371934a8b28f57030858965f6869c3705836 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Wed, 14 Aug 2024 18:54:53 +0000
Subject: [PATCH 14/19] XANMOD: mm/vmscan: Set minimum amount of swapping
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -199,7 +199,7 @@ struct scan_control {
/*
* From 0 .. MAX_SWAPPINESS. Higher means more swappy.
*/
-int vm_swappiness = 60;
+int vm_swappiness = 1;
#ifdef CONFIG_MEMCG
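
Likewise, swappiness is only a compiled-in default and remains tunable at runtime:

    cat /proc/sys/vm/swappiness      # 1 with this patch
    sysctl -w vm.swappiness=60       # restore the upstream default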

View File

@@ -0,0 +1,84 @@
From 8b42012dc4aa0008ad10cf7575f684ebc3a587cf Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Wed, 15 Jun 2022 17:07:29 +0000
Subject: [PATCH 15/19] XANMOD: sched/autogroup: Add kernel parameter and
config option to enable/disable autogroup feature by default
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
Documentation/admin-guide/kernel-parameters.txt | 6 ++++--
init/Kconfig | 12 ++++++++++++
kernel/sched/autogroup.c | 9 ++++++---
3 files changed, 22 insertions(+), 5 deletions(-)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -493,6 +493,10 @@
Format: <int> (must be >=0)
Default: 64
+ autogroup= [KNL] Enable or disable scheduler automatic task group
+ creation.
+ Format: <bool>
+
bau= [X86_UV] Enable the BAU on SGI UV. The default
behavior is to disable the BAU (i.e. bau=0).
Format: { "0" | "1" }
@@ -3835,8 +3839,6 @@
noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
- noautogroup Disable scheduler automatic task group creation.
-
nocache [ARM,EARLY]
no_console_suspend
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1309,6 +1309,18 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_AUTOGROUP_DEFAULT_ENABLED
+ bool "Enable automatic process group scheduling feature"
+ default y
+ depends on SCHED_AUTOGROUP
+ help
+	  If set, automatic process group scheduling will be enabled by
+	  default but can be disabled by passing autogroup=0 on the
+	  kernel command line during boot, or by writing 0 to
+	  /proc/sys/kernel/sched_autogroup_enabled.
+
+ If unsure say Y.
+
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
select IRQ_WORK
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -4,7 +4,8 @@
* Auto-group scheduling implementation:
*/
-unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+unsigned int __read_mostly sysctl_sched_autogroup_enabled =
+ IS_ENABLED(CONFIG_SCHED_AUTOGROUP_DEFAULT_ENABLED) ? 1 : 0;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
@@ -219,11 +220,13 @@ void sched_autogroup_exit(struct signal_
static int __init setup_autogroup(char *str)
{
- sysctl_sched_autogroup_enabled = 0;
+ unsigned long enabled;
+ if (!kstrtoul(str, 0, &enabled))
+ sysctl_sched_autogroup_enabled = enabled ? 1 : 0;
return 1;
}
-__setup("noautogroup", setup_autogroup);
+__setup("autogroup=", setup_autogroup);
#ifdef CONFIG_PROC_FS
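
With this change the old noautogroup parameter is gone; the feature is toggled either at boot or at runtime:

    # kernel command line: autogroup=0 disables it, autogroup=1 enables it
    cat /proc/sys/kernel/sched_autogroup_enabled
    echo 0 > /proc/sys/kernel/sched_autogroup_enabled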

View File

@@ -0,0 +1,62 @@
From f41037c4ee33dd201c1750727daa390a3f652516 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Tue, 31 Mar 2020 13:32:08 -0300
Subject: [PATCH 16/19] XANMOD: cpufreq: tunes ondemand and conservative
governor for performance
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
drivers/cpufreq/cpufreq_conservative.c | 8 ++++----
drivers/cpufreq/cpufreq_ondemand.c | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -28,8 +28,8 @@ struct cs_dbs_tuners {
};
/* Conservative governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
-#define DEF_FREQUENCY_DOWN_THRESHOLD (20)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD (26)
#define DEF_FREQUENCY_STEP (5)
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (10)
@@ -47,9 +47,9 @@ static inline unsigned int get_freq_step
}
/*
- * Every sampling_rate, we check, if current idle time is less than 20%
+ * Every sampling_rate, we check, if current idle time is less than 37%
* (default), then we try to increase frequency. Every sampling_rate *
- * sampling_down_factor, we check, if current idle time is more than 80%
+ * sampling_down_factor, we check, if current idle time is more than 74%
* (default), then we try to decrease frequency
*
* Frequency updates happen at minimum steps of 5% (default) of maximum
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -18,10 +18,10 @@
#include "cpufreq_ondemand.h"
/* On-demand governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
-#define DEF_SAMPLING_DOWN_FACTOR (1)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
+#define DEF_SAMPLING_DOWN_FACTOR (100)
#define MAX_SAMPLING_DOWN_FACTOR (100000)
-#define MICRO_FREQUENCY_UP_THRESHOLD (95)
+#define MICRO_FREQUENCY_UP_THRESHOLD (70)
#define MIN_FREQUENCY_UP_THRESHOLD (1)
#define MAX_FREQUENCY_UP_THRESHOLD (100)
@@ -128,7 +128,7 @@ static void dbs_freq_increase(struct cpu
}
/*
- * Every sampling_rate, we check, if current idle time is less than 20%
+ * Every sampling_rate, we check, if current idle time is less than 37%
* (default), then we try to increase frequency. Else, we adjust the frequency
* proportional to load.
*/
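
The thresholds are still exported as governor tunables, so the upstream values can be restored without a rebuild once the respective governor is active (the conservative governor has a matching directory). Sketch for ondemand:

    cat /sys/devices/system/cpu/cpufreq/ondemand/up_threshold   # lowered by this patch (63/70 instead of 80/95)
    echo 80 > /sys/devices/system/cpu/cpufreq/ondemand/up_threshold
    echo 1  > /sys/devices/system/cpu/cpufreq/ondemand/sampling_down_factor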

View File

@@ -0,0 +1,42 @@
From 000614733021b6723cfd78a4908b9b2e869ea001 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 16 Sep 2024 08:09:56 +0000
Subject: [PATCH 17/19] XANMOD: lib/kconfig.debug: disable default
SYMBOLIC_ERRNAME and DEBUG_BUGVERBOSE
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
fs/bcachefs/Kconfig | 1 -
lib/Kconfig.debug | 4 ++--
2 files changed, 2 insertions(+), 3 deletions(-)
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -23,7 +23,6 @@ config BCACHEFS_FS
select XOR_BLOCKS
select XXHASH
select SRCU
- select SYMBOLIC_ERRNAME
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -190,7 +190,7 @@ config DYNAMIC_DEBUG_CORE
config SYMBOLIC_ERRNAME
bool "Support symbolic error names in printf"
- default y if PRINTK
+ default n
help
If you say Y here, the kernel's printf implementation will
be able to print symbolic error names such as ENOSPC instead
@@ -200,7 +200,7 @@ config SYMBOLIC_ERRNAME
config DEBUG_BUGVERBOSE
bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT
depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE)
- default y
+ default n
help
Say Y here to make BUG() panics output the file name and line number
of the BUG call as well as the EIP and oops trace. This aids

View File

@@ -0,0 +1,21 @@
From bea9f2e6b80582dc36b5693e08633546b14b2a4e Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Sun, 29 May 2022 00:57:40 +0000
Subject: [PATCH 18/19] XANMOD: scripts/setlocalversion: remove "+" tag for git
repo short version
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
scripts/setlocalversion | 1 -
1 file changed, 1 deletion(-)
--- a/scripts/setlocalversion
+++ b/scripts/setlocalversion
@@ -92,7 +92,6 @@ scm_version()
# If only the short version is requested, don't bother
# running further git commands
if $short; then
- echo "+"
return
fi
# If we are past the tagged commit, we pretty print it.
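
The visible effect is that the short-version path no longer appends "+" to the release string (normally added when the tree is not exactly on a tagged commit and CONFIG_LOCALVERSION_AUTO is disabled). The resulting string can be checked from the source tree with:

    make -s kernelrelease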

View File

@@ -0,0 +1,19 @@
From eb42d6c8411f9a70472e6f3632051e2e44910843 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 24 Apr 2023 04:50:34 +0000
Subject: [PATCH 19/19] XANMOD: scripts/setlocalversion: Move localversion*
files to the end
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
scripts/setlocalversion | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/scripts/setlocalversion
+++ b/scripts/setlocalversion
@@ -182,4 +182,4 @@ elif [ "${LOCALVERSION+set}" != "set" ];
scm_version="$(scm_version --short)"
fi
-echo "${KERNELVERSION}${file_localversion}${config_localversion}${LOCALVERSION}${scm_version}"
+echo "${KERNELVERSION}${config_localversion}${LOCALVERSION}${scm_version}${file_localversion}"