[PATCH 2/2] powerpc/smp: Use GFP_ATOMIC while allocating tmp mask

Mon Oct 19 15:27:16 AEDT 2020

Qian Cai reported a regression where CPU Hotplug fails with the latest
powerpc/next

BUG: sleeping function called from invalid context at mm/slab.h:494
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/88
no locks held by swapper/88/0.
irq event stamp: 18074448
hardirqs last  enabled at (18074447): [<c0000000001a2a7c>] tick_nohz_idle_enter+0x9c/0x110
hardirqs last disabled at (18074448): [<c000000000106798>] do_idle+0x138/0x3b0
do_idle at kernel/sched/idle.c:253 (discriminator 1)
softirqs last  enabled at (18074440): [<c0000000000bbec4>] irq_enter_rcu+0x94/0xa0
softirqs last disabled at (18074439): [<c0000000000bbea0>] irq_enter_rcu+0x70/0xa0
CPU: 88 PID: 0 Comm: swapper/88 Tainted: G        W         5.9.0-rc8-next-20201007 #1
Call Trace:
[c00020000a4bfcf0] [c000000000649e98] dump_stack+0xec/0x144 (unreliable)
[c00020000a4bfd30] [c0000000000f6c34] ___might_sleep+0x2f4/0x310
[c00020000a4bfdb0] [c000000000354f94] slab_pre_alloc_hook.constprop.82+0x124/0x190
[c00020000a4bfe00] [c00000000035e9e8] __kmalloc_node+0x88/0x3a0
slab_alloc_node at mm/slub.c:2817
(inlined by) __kmalloc_node at mm/slub.c:4013
[c00020000a4bfe80] [c0000000006494d8] alloc_cpumask_var_node+0x38/0x80
kmalloc_node at include/linux/slab.h:577
(inlined by) alloc_cpumask_var_node at lib/cpumask.c:116
[c00020000a4bfef0] [c00000000003eedc] start_secondary+0x27c/0x800
update_mask_by_l2 at arch/powerpc/kernel/smp.c:1267
(inlined by) add_cpu_to_masks at arch/powerpc/kernel/smp.c:1387
(inlined by) start_secondary at arch/powerpc/kernel/smp.c:1420
[c00020000a4bff90] [c00000000000c468] start_secondary_resume+0x10/0x14

Allocating a temporary mask while performing a CPU Hotplug operation
with CONFIG_CPUMASK_OFFSTACK enabled, leads to calling a sleepable
function from a atomic context. Fix this by allocating the temporary
mask with GFP_ATOMIC flag. Also instead of having to allocate twice,
allocate the mask in the caller so that we only have to allocate once.
If the allocation fails, assume the mask to be same as sibling mask, which
will make the scheduler to drop this domain for this CPU.

Fixes: 70a94089d7f7 ("powerpc/smp: Optimize update_coregroup_mask")
Fixes: 3ab33d6dc3e9 ("powerpc/smp: Optimize update_mask_by_l2")
Reported-by: Qian Cai <cai at redhat.com>
Signed-off-by: Srikar Dronamraju <srikar at linux.vnet.ibm.com>
Cc: linuxppc-dev <linuxppc-dev at lists.ozlabs.org>
Cc: LKML <linux-kernel at vger.kernel.org>
Cc: Michael Ellerman <mpe at ellerman.id.au>
Cc: Nathan Lynch <nathanl at linux.ibm.com>
Cc: Gautham R Shenoy <ego at linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo at kernel.org>
Cc: Peter Zijlstra <peterz at infradead.org>
Cc: Valentin Schneider <valentin.schneider at arm.com>
Cc: Qian Cai <cai at redhat.com>
---
Changelog v1->v2:
https://lore.kernel.org/linuxppc-dev/20201008034240.34059-1-srikar@linux.vnet.ibm.com/t/#u
Updated 2nd patch based on comments from Michael Ellerman
- Remove the WARN_ON.
- Handle allocation failures in a more subtle fashion
- Allocate in the caller so that we allocate once.

 arch/powerpc/kernel/smp.c | 57 +++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a864b9b3228c..028479e9b66b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1257,38 +1257,33 @@ static struct device_node *cpu_to_l2cache(int cpu)
 	return cache;
 }
 
-static bool update_mask_by_l2(int cpu)
+static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
 {
 	struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
 	struct device_node *l2_cache, *np;
-	cpumask_var_t mask;
 	int i;
 
 	if (has_big_cores)
 		submask_fn = cpu_smallcore_mask;
 
 	l2_cache = cpu_to_l2cache(cpu);
-	if (!l2_cache) {
-		/*
-		 * If no l2cache for this CPU, assume all siblings to share
-		 * cache with this CPU.
-		 */
+	if (!l2_cache || !*mask) {
+		/* Assume only core siblings share cache with this CPU */
 		for_each_cpu(i, submask_fn(cpu))
 			set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
 		return false;
 	}
 
-	alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
-	cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
 
 	/* Update l2-cache mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
 
 	/* Skip all CPUs already part of current CPU l2-cache mask */
-	cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
+	cpumask_andnot(*mask, *mask, cpu_l2_cache_mask(cpu));
 
-	for_each_cpu(i, mask) {
+	for_each_cpu(i, *mask) {
 		/*
 		 * when updating the marks the current CPU has not been marked
 		 * online, but we need to update the cache masks
@@ -1298,15 +1293,14 @@ static bool update_mask_by_l2(int cpu)
 		/* Skip all CPUs already part of current CPU l2-cache */
 		if (np == l2_cache) {
 			or_cpumasks_related(cpu, i, submask_fn, cpu_l2_cache_mask);
-			cpumask_andnot(mask, mask, submask_fn(i));
+			cpumask_andnot(*mask, *mask, submask_fn(i));
 		} else {
-			cpumask_andnot(mask, mask, cpu_l2_cache_mask(i));
+			cpumask_andnot(*mask, *mask, cpu_l2_cache_mask(i));
 		}
 
 		of_node_put(np);
 	}
 	of_node_put(l2_cache);
-	free_cpumask_var(mask);
 
 	return true;
 }
@@ -1349,40 +1343,46 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
 	}
 }
 
-static void update_coregroup_mask(int cpu)
+static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
 {
 	struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
-	cpumask_var_t mask;
 	int coregroup_id = cpu_to_coregroup_id(cpu);
 	int i;
 
-	alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
-	cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
-
 	if (shared_caches)
 		submask_fn = cpu_l2_cache_mask;
 
+	if (!*mask) {
+		/* Assume only siblings are part of this CPU's coregroup */
+		for_each_cpu(i, submask_fn(cpu))
+			set_cpus_related(cpu, i, cpu_coregroup_mask);
+
+		return;
+	}
+
+	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
 	/* Update coregroup mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
 
 	/* Skip all CPUs already part of coregroup mask */
-	cpumask_andnot(mask, mask, cpu_coregroup_mask(cpu));
+	cpumask_andnot(*mask, *mask, cpu_coregroup_mask(cpu));
 
-	for_each_cpu(i, mask) {
+	for_each_cpu(i, *mask) {
 		/* Skip all CPUs not part of this coregroup */
 		if (coregroup_id == cpu_to_coregroup_id(i)) {
 			or_cpumasks_related(cpu, i, submask_fn, cpu_coregroup_mask);
-			cpumask_andnot(mask, mask, submask_fn(i));
+			cpumask_andnot(*mask, *mask, submask_fn(i));
 		} else {
-			cpumask_andnot(mask, mask, cpu_coregroup_mask(i));
+			cpumask_andnot(*mask, *mask, cpu_coregroup_mask(i));
 		}
 	}
-	free_cpumask_var(mask);
 }
 
 static void add_cpu_to_masks(int cpu)
 {
 	int first_thread = cpu_first_thread_sibling(cpu);
+	cpumask_var_t mask;
 	int i;
 
 	/*
@@ -1396,10 +1396,15 @@ static void add_cpu_to_masks(int cpu)
 			set_cpus_related(i, cpu, cpu_sibling_mask);
 
 	add_cpu_to_smallcore_masks(cpu);
-	update_mask_by_l2(cpu);
+
+	/* In CPU-hotplug path, hence use GFP_ATOMIC */
+	alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu));
+	update_mask_by_l2(cpu, &mask);
 
 	if (has_coregroup_support())
-		update_coregroup_mask(cpu);
+		update_coregroup_mask(cpu, &mask);
+
+	free_cpumask_var(mask);
 }
 
 /* Activate a secondary processor. */
-- 
2.18.2