106 lines
4.9 KiB
Diff
106 lines
4.9 KiB
Diff
From a2324e3cf5378205b4a18c3fa2cfe702a26f81d4 Mon Sep 17 00:00:00 2001
|
|
From: Christian Loehle <christian.loehle@arm.com>
|
|
Date: Wed, 3 Sep 2025 16:48:32 +0100
|
|
Subject: sched: Fix sched_numa_find_nth_cpu() if mask offline
|
|
|
|
sched_numa_find_nth_cpu() uses a bsearch to look for the 'closest'
|
|
CPU in sched_domains_numa_masks and the given cpus mask. However, they
|
|
might not intersect if all CPUs in the cpus mask are offline. bsearch
|
|
will return NULL in that case; bail out instead of dereferencing a
|
|
bogus pointer.
|
|
|
|
The previous behaviour led to this bug when using maxcpus=4 on an
|
|
rk3399 (LLLLbb) (i.e. booting with all big CPUs offline):
|
|
|
|
[ 1.422922] Unable to handle kernel paging request at virtual address ffffff8000000000
|
|
[ 1.423635] Mem abort info:
|
|
[ 1.423889] ESR = 0x0000000096000006
|
|
[ 1.424227] EC = 0x25: DABT (current EL), IL = 32 bits
|
|
[ 1.424715] SET = 0, FnV = 0
|
|
[ 1.424995] EA = 0, S1PTW = 0
|
|
[ 1.425279] FSC = 0x06: level 2 translation fault
|
|
[ 1.425735] Data abort info:
|
|
[ 1.425998] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000
|
|
[ 1.426499] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
|
|
[ 1.426952] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
|
|
[ 1.427428] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000004a9f000
|
|
[ 1.428038] [ffffff8000000000] pgd=18000000f7fff403, p4d=18000000f7fff403, pud=18000000f7fff403, pmd=0000000000000000
|
|
[ 1.429014] Internal error: Oops: 0000000096000006 [#1] SMP
|
|
[ 1.429525] Modules linked in:
|
|
[ 1.429813] CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-rc4-dirty #343 PREEMPT
|
|
[ 1.430559] Hardware name: Pine64 RockPro64 v2.1 (DT)
|
|
[ 1.431012] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
|
|
[ 1.431634] pc : sched_numa_find_nth_cpu+0x2a0/0x488
|
|
[ 1.432094] lr : sched_numa_find_nth_cpu+0x284/0x488
|
|
[ 1.432543] sp : ffffffc084e1b960
|
|
[ 1.432843] x29: ffffffc084e1b960 x28: ffffff80078a8800 x27: ffffffc0846eb1d0
|
|
[ 1.433495] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
|
|
[ 1.434144] x23: 0000000000000000 x22: fffffffffff7f093 x21: ffffffc081de6378
|
|
[ 1.434792] x20: 0000000000000000 x19: 0000000ffff7f093 x18: 00000000ffffffff
|
|
[ 1.435441] x17: 3030303866666666 x16: 66663d736b73616d x15: ffffffc104e1b5b7
|
|
[ 1.436091] x14: 0000000000000000 x13: ffffffc084712860 x12: 0000000000000372
|
|
[ 1.436739] x11: 0000000000000126 x10: ffffffc08476a860 x9 : ffffffc084712860
|
|
[ 1.437389] x8 : 00000000ffffefff x7 : ffffffc08476a860 x6 : 0000000000000000
|
|
[ 1.438036] x5 : 000000000000bff4 x4 : 0000000000000000 x3 : 0000000000000000
|
|
[ 1.438683] x2 : 0000000000000000 x1 : ffffffc0846eb000 x0 : ffffff8000407b68
|
|
[ 1.439332] Call trace:
|
|
[ 1.439559] sched_numa_find_nth_cpu+0x2a0/0x488 (P)
|
|
[ 1.440016] smp_call_function_any+0xc8/0xd0
|
|
[ 1.440416] armv8_pmu_init+0x58/0x27c
|
|
[ 1.440770] armv8_cortex_a72_pmu_init+0x20/0x2c
|
|
[ 1.441199] arm_pmu_device_probe+0x1e4/0x5e8
|
|
[ 1.441603] armv8_pmu_device_probe+0x1c/0x28
|
|
[ 1.442007] platform_probe+0x5c/0xac
|
|
[ 1.442347] really_probe+0xbc/0x298
|
|
[ 1.442683] __driver_probe_device+0x78/0x12c
|
|
[ 1.443087] driver_probe_device+0xdc/0x160
|
|
[ 1.443475] __driver_attach+0x94/0x19c
|
|
[ 1.443833] bus_for_each_dev+0x74/0xd4
|
|
[ 1.444190] driver_attach+0x24/0x30
|
|
[ 1.444525] bus_add_driver+0xe4/0x208
|
|
[ 1.444874] driver_register+0x60/0x128
|
|
[ 1.445233] __platform_driver_register+0x24/0x30
|
|
[ 1.445662] armv8_pmu_driver_init+0x28/0x4c
|
|
[ 1.446059] do_one_initcall+0x44/0x25c
|
|
[ 1.446416] kernel_init_freeable+0x1dc/0x3bc
|
|
[ 1.446820] kernel_init+0x20/0x1d8
|
|
[ 1.447151] ret_from_fork+0x10/0x20
|
|
[ 1.447493] Code: 90022e21 f000e5f5 910de2b5 2a1703e2 (f8767803)
|
|
[ 1.448040] ---[ end trace 0000000000000000 ]---
|
|
[ 1.448483] note: swapper/0[1] exited with preempt_count 1
|
|
[ 1.449047] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
|
|
[ 1.449741] SMP: stopping secondary CPUs
|
|
[ 1.450105] Kernel Offset: disabled
|
|
[ 1.450419] CPU features: 0x000000,00080000,20002001,0400421b
|
|
[ 1.450935] Memory Limit: none
|
|
[ 1.451217] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---
|
|
|
|
Yury: with the fix, the function returns cpu == nr_cpu_ids, and later in
|
|
|
|
smp_call_function_any ->
|
|
smp_call_function_single ->
|
|
generic_exec_single
|
|
|
|
we test the cpu for '>= nr_cpu_ids' and return -ENXIO. So everything is
|
|
handled correctly.
|
|
|
|
Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()")
|
|
Cc: stable@vger.kernel.org
|
|
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
|
|
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
|
|
---
|
|
kernel/sched/topology.c | 2 ++
|
|
1 file changed, 2 insertions(+)
|
|
|
|
--- a/kernel/sched/topology.c
|
|
+++ b/kernel/sched/topology.c
|
|
@@ -2212,6 +2212,8 @@ int sched_numa_find_nth_cpu(const struct
|
|
goto unlock;
|
|
|
|
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
|
|
+ if (!hop_masks)
|
|
+ goto unlock;
|
|
hop = hop_masks - k.masks;
|
|
|
|
ret = hop ?
|