From 3c32c0d457a2c4b2817f57e1e2c9cbba4624639e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Nov 2024 11:33:05 -0800 Subject: futex: improve user space accesses Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9% improvement" report for his patch that changed __get_user() to use pointer masking instead of the explicit speculation barrier. However, that patch doesn't actually work in the general case, because some (very bad) architecture-specific code actually depends on __get_user() also working on kernel addresses. A profile showed that the offending __get_user() was the futex code, which really should be fixed up to not use that horrid legacy case. Rewrite futex_get_value_locked() to use the modern user acccess helpers, and inline it so that the compiler not only avoids the function call for a few instructions, but can do CSE on the address masking. It also turns out the x86 futex functions have unnecessary barriers in other places, so let's fix those up too. Link: https://lore.kernel.org/all/20241115230653.hfvzyf3aqqntgp63@jpoimboe/ Reported-by: Josh Poimboeuf Signed-off-by: Linus Torvalds --- arch/x86/include/asm/futex.h | 8 +++-- kernel/futex/core.c | 22 -------------- kernel/futex/futex.h | 59 ++++++++++++++++++++++++++++++++++-- 3 files changed, 63 insertions(+), 26 deletions(-) --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -48,7 +48,9 @@ do { \ static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; switch (op) { @@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_i { int ret = 0; - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; asm volatile("\n" "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct return NULL; } -int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) -{ - int ret; - - pagefault_disable(); - ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); - pagefault_enable(); - - return ret; -} - -int futex_get_value_locked(u32 *dest, u32 __user *from) -{ - int ret; - - pagefault_disable(); - ret = __get_user(*dest, from); - pagefault_enable(); - - return ret ? -EFAULT : 0; -} - /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_PREEMPT_RT #include @@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct fut extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); -extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); -extern int futex_get_value_locked(u32 *dest, u32 __user *from); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); +static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) +{ + int ret; + + pagefault_disable(); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); + pagefault_enable(); + + return ret; +} + +/* + * This does a plain atomic user space read, and the user pointer has + * already been verified earlier by get_futex_key() to be both aligned + * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). + * + * We still want to avoid any speculation, and while __get_user() is + * the traditional model for this, it's actually slower then doing + * this manually these days. + * + * We could just have a per-architecture special function for it, + * the same way we do futex_atomic_cmpxchg_inatomic(), but rather + * than force everybody to do that, write it out long-hand using + * the low-level user-access infrastructure. + * + * This looks a bit overkill, but generally just results in a couple + * of instructions. + */ +static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) +{ + u32 val; + + if (can_do_masked_user_access()) + from = masked_user_access_begin(from); + else if (!user_read_access_begin(from, sizeof(*from))) + return -EFAULT; + unsafe_get_user(val, from, Efault); + user_access_end(); + *dest = val; + return 0; +Efault: + user_access_end(); + return -EFAULT; +} + +static inline int futex_get_value_locked(u32 *dest, u32 __user *from) +{ + int ret; + + pagefault_disable(); + ret = futex_read_inatomic(dest, from); + pagefault_enable(); + + return ret; +} + extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); extern int futex_unqueue(struct futex_q *q);