[RFC] powerpc/hotplug: Resolve dlpar hot-add cpu deadlock

Michael Bringmann mwb at linux.vnet.ibm.com
Fri Jun 2 23:34:30 AEST 2017


A deadlock was detected in the hot-add CPU operation when modifying a
Shared CPU Topology configuration on powerpc.  In one thread,
cpu_hotplug_begin() acquires 'cpu_hotplug.lock'; a subsequent call to
get_online_cpus() lower in the same call chain then attempts to acquire
the same lock.  The get_online_cpus() call cannot complete because
'cpu_hotplug.lock' is already held, nor can it take the early-exit path
in get_online_cpus(), because cpu_hotplug.active_writer was set to the
thread that called cpu_hotplug_begin(), and that does not match the
value of 'current' in the thread now calling get_online_cpus().

Here is a dump of the deadlocked tasks in the system:

[  489.684102] INFO: task kworker/0:1:823 blocked for more than 120 seconds.
[  489.684131]       Not tainted 4.12.0-rc3.wi91275_054a_02.ppc64le+ #18
[  489.684138] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  489.684146] kworker/0:1     D    0   823      2 0x00000800
[  489.684161] Workqueue: events vmstat_shepherd
[  489.684169] Call Trace:
[  489.684177] [c0000004d1f57940] [c00000000001ce5c] __switch_to+0x30c/0x430
[  489.684188] [c0000004d1f579a0] [c000000000a1fa54] __schedule+0x4d4/0xfd0
[  489.684198] [c0000004d1f57a70] [c000000000a2059c] schedule+0x4c/0xd0
[  489.684208] [c0000004d1f57aa0] [c000000000a20b7c] schedule_preempt_disabled+0x2c/0x50
[  489.684218] [c0000004d1f57ad0] [c000000000a2272c] __mutex_lock+0x34c/0xb40
[  489.684228] [c0000004d1f57be0] [c0000000000e9194] get_online_cpus+0x94/0xd0
[  489.684238] [c0000004d1f57c10] [c0000000002c7178] vmstat_shepherd+0x38/0x150
[  489.684249] [c0000004d1f57c60] [c000000000114dcc] process_one_work+0x22c/0x7f0
[  489.684260] [c0000004d1f57d30] [c000000000115418] worker_thread+0x88/0x4b0
[  489.684270] [c0000004d1f57dc0] [c00000000011e520] kthread+0x170/0x1b0
[  489.684280] [c0000004d1f57e30] [c00000000000b4f4] ret_from_kernel_thread+0x5c/0x68
[  489.684312] INFO: task irqbalance:3141 blocked for more than 120 seconds.
[  489.684320]       Not tainted 4.12.0-rc3.wi91275_054a_02.ppc64le+ #18
[  489.684326] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  489.684334] irqbalance      D    0  3141      1 0x00040080
[  489.684343] Call Trace:
[  489.684349] [c0000004a6247860] [c00000000001ce5c] __switch_to+0x30c/0x430
[  489.684359] [c0000004a62478c0] [c000000000a1fa54] __schedule+0x4d4/0xfd0
[  489.684368] [c0000004a6247990] [c000000000a2059c] schedule+0x4c/0xd0
[  489.684378] [c0000004a62479c0] [c000000000a20b7c] schedule_preempt_disabled+0x2c/0x50
[  489.684388] [c0000004a62479f0] [c000000000a2272c] __mutex_lock+0x34c/0xb40
[  489.684398] [c0000004a6247b00] [c00000000067ecc8] online_show+0x38/0x90
[  489.684408] [c0000004a6247b80] [c00000000067f5a0] dev_attr_show+0x40/0xa0
[  489.684419] [c0000004a6247bb0] [c00000000043e8f0] sysfs_kf_seq_show+0xf0/0x1c0
[  489.684429] [c0000004a6247c00] [c00000000043c1cc] kernfs_seq_show+0x3c/0x50
[  489.684439] [c0000004a6247c20] [c0000000003ae4cc] seq_read+0xfc/0x570
[  489.684449] [c0000004a6247cc0] [c00000000043d484] kernfs_fop_read+0x1a4/0x270
[  489.684459] [c0000004a6247d10] [c000000000370380] __vfs_read+0x40/0x1b0
[  489.684469] [c0000004a6247da0] [c000000000371d3c] vfs_read+0xac/0x190
[  489.684479] [c0000004a6247de0] [c000000000373a40] SyS_read+0x60/0x110
[  489.684489] [c0000004a6247e30] [c00000000000b190] system_call+0x38/0xe0
[  489.684555] INFO: task kworker/24:3:12619 blocked for more than 120 seconds.
[  489.684563]       Not tainted 4.12.0-rc3.wi91275_054a_02.ppc64le+ #18
[  489.684570] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  489.684577] kworker/24:3    D    0 12619      2 0x00000880
[  489.684588] Workqueue: events rtas_event_scan
[  489.684596] Call Trace:
[  489.684603] [c0000004ba15b900] [c00000000001ce5c] __switch_to+0x30c/0x430
[  489.684612] [c0000004ba15b960] [c000000000a1fa54] __schedule+0x4d4/0xfd0
[  489.684622] [c0000004ba15ba30] [c000000000a2059c] schedule+0x4c/0xd0
[  489.684631] [c0000004ba15ba60] [c000000000a20b7c] schedule_preempt_disabled+0x2c/0x50
[  489.684642] [c0000004ba15ba90] [c000000000a2272c] __mutex_lock+0x34c/0xb40
[  489.684651] [c0000004ba15bba0] [c0000000000e9194] get_online_cpus+0x94/0xd0
[  489.684661] [c0000004ba15bbd0] [c000000000036318] rtas_event_scan+0xa8/0x2f0
[  489.684671] [c0000004ba15bc60] [c000000000114dcc] process_one_work+0x22c/0x7f0
[  489.684682] [c0000004ba15bd30] [c000000000115418] worker_thread+0x88/0x4b0
[  489.684692] [c0000004ba15bdc0] [c00000000011e520] kthread+0x170/0x1b0
[  489.684702] [c0000004ba15be30] [c00000000000b4f4] ret_from_kernel_thread+0x5c/0x68
[  489.684714] INFO: task drmgr:14416 blocked for more than 120 seconds.
[  489.684721]       Not tainted 4.12.0-rc3.wi91275_054a_02.ppc64le+ #18
[  489.684728] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  489.684735] drmgr           D    0 14416   4793 0x00040080
[  489.684744] Call Trace:
[  489.684750] [c00000049d8b3470] [c00000000001ce5c] __switch_to+0x30c/0x430
[  489.684760] [c00000049d8b34d0] [c000000000a1fa54] __schedule+0x4d4/0xfd0
[  489.684770] [c00000049d8b35a0] [c000000000a2059c] schedule+0x4c/0xd0
[  489.684779] [c00000049d8b35d0] [c000000000a26f74] schedule_timeout+0x284/0x590
[  489.684790] [c00000049d8b36f0] [c000000000a217fc] wait_for_common+0x18c/0x280
[  489.684800] [c00000049d8b3770] [c0000000000ebcf4] bringup_cpu+0x84/0xe0
[  489.684810] [c00000049d8b37a0] [c0000000000e9a00] cpuhp_invoke_callback+0x240/0xcd0
[  489.684820] [c00000049d8b3860] [c0000000000ea6d8] cpuhp_up_callbacks+0x58/0x190
[  489.684830] [c00000049d8b38f0] [c0000000000ec0d8] _cpu_up+0xe8/0x1d0
[  489.684840] [c00000049d8b3950] [c0000000000ec2e0] do_cpu_up+0x120/0x150
[  489.684850] [c00000049d8b39d0] [c00000000068b7fc] cpu_subsys_online+0x5c/0xe0
[  489.684860] [c00000049d8b3a20] [c000000000682fe4] device_online+0xb4/0x120
[  489.684871] [c00000049d8b3a60] [c0000000000b54b4] dlpar_online_cpu+0x1e4/0x220
[  489.684881] [c00000049d8b3b30] [c0000000000b5c28] dlpar_cpu_add+0x108/0x2f0
[  489.684891] [c00000049d8b3bc0] [c0000000000b691c] dlpar_cpu_probe+0x3c/0x80
[  489.684901] [c00000049d8b3c00] [c0000000000231b8] arch_cpu_probe+0x38/0x60
[  489.684911] [c00000049d8b3c20] [c00000000068c020] cpu_probe_store+0x40/0x70
[  489.684921] [c00000049d8b3c50] [c00000000067e014] dev_attr_store+0x34/0x60
[  489.684931] [c00000049d8b3c70] [c00000000043e7d4] sysfs_kf_write+0x94/0xc0
[  489.684951] [c00000049d8b3cb0] [c00000000043d1f4] kernfs_fop_write+0x194/0x280
[  489.684961] [c00000049d8b3d00] [c000000000370530] __vfs_write+0x40/0x1c0
[  489.684970] [c00000049d8b3d90] [c000000000371ee8] vfs_write+0xc8/0x240
[  489.684981] [c00000049d8b3de0] [c000000000373b50] SyS_write+0x60/0x110
[  489.684991] [c00000049d8b3e30] [c00000000000b190] system_call+0x38/0xe0
[  489.685002] INFO: task cpuhp/136:14931 blocked for more than 120 seconds.
[  489.685009]       Not tainted 4.12.0-rc3.wi91275_054a_02.ppc64le+ #18
[  489.685015] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  489.685022] cpuhp/136       D    0 14931      2 0x00000880
[  489.685031] Call Trace:
[  489.685038] [c00000049eb2b4a0] [c00000000001ce5c] __switch_to+0x30c/0x430
[  489.685047] [c00000049eb2b500] [c000000000a1fa54] __schedule+0x4d4/0xfd0
[  489.685057] [c00000049eb2b5d0] [c000000000a2059c] schedule+0x4c/0xd0
[  489.685066] [c00000049eb2b600] [c000000000a20b7c] schedule_preempt_disabled+0x2c/0x50
[  489.685077] [c00000049eb2b630] [c000000000a2272c] __mutex_lock+0x34c/0xb40
[  489.685086] [c00000049eb2b740] [c0000000000e9194] get_online_cpus+0x94/0xd0
[  489.685097] [c00000049eb2b770] [c000000000200844] stop_machine+0x34/0x1f0
[  489.685108] [c00000049eb2b7e0] [c000000000074b04] arch_update_cpu_topology+0x524/0x650
[  489.685119] [c00000049eb2ba80] [c00000000015f24c] partition_sched_domains+0x9c/0x4f0
[  489.685130] [c00000049eb2bb40] [c0000000001fb3c4] cpuset_update_active_cpus+0x24/0x60
[  489.685141] [c00000049eb2bb60] [c000000000137874] sched_cpu_activate+0x1a4/0x1b0
[  489.685151] [c00000049eb2bbb0] [c0000000000e9a00] cpuhp_invoke_callback+0x240/0xcd0
[  489.685161] [c00000049eb2bc70] [c0000000000ea6d8] cpuhp_up_callbacks+0x58/0x190
[  489.685171] [c00000049eb2bd00] [c0000000000ea978] cpuhp_thread_fun+0x168/0x1a0
[  489.685182] [c00000049eb2bd30] [c000000000124d2c] smpboot_thread_fn+0x2fc/0x3b0
[  489.685192] [c00000049eb2bdc0] [c00000000011e520] kthread+0x170/0x1b0
[  489.685203] [c00000049eb2be30] [c00000000000b4f4] ret_from_kernel_thread+0x5c/0x68

Here is a dump of all of the locks held in the system:

[  489.685213] Showing all locks held in the system:
[  489.685221] hm, tasklist_lock locked, retrying...
[  489.685223]  #1 locked it.
[  489.885281] 4 locks held by kworker/0:1/823:
[  489.885288]  #0:  ("events"){.+.+.+}, at: [<c000000000114d1c>] process_one_work+0x17c/0x7f0
[  489.885308]  #1:  ((shepherd).work){+.+...}, at: [<c000000000114d1c>] process_one_work+0x17c/0x7f0
[  489.885327]  #2:  (cpu_hotplug.dep_map){++++++}, at: [<c0000000000e9158>] get_online_cpus+0x58/0xd0
[  489.885345]  #3:  (cpu_hotplug.lock){+.+.+.}, at: [<c0000000000e9194>] get_online_cpus+0x94/0xd0
[  489.885363] 1 lock held by khungtaskd/829:
[  489.885368]  #0:  (tasklist_lock){.+.+..}, at: [<c00000000016b974>] debug_show_all_locks+0xa4/0x210
[  489.885397] 4 locks held by irqbalance/3141:
[  489.885403]  #0:  (&p->lock){+.+.+.}, at: [<c0000000003ae43c>] seq_read+0x6c/0x570
[  489.885420]  #1:  (&of->mutex){+.+.+.}, at: [<c00000000043c7a8>] kernfs_seq_start+0x38/0x120
[  489.885439]  #2:  (s_active#14){.+.+.+}, at: [<c00000000043c7b4>] kernfs_seq_start+0x44/0x120
[  489.885459]  #3:  (&dev->mutex){......}, at: [<c00000000067ecc8>] online_show+0x38/0x90
[  489.885477] 2 locks held by agetty/3195:
[  489.885482]  #0:  (&tty->ldisc_sem){++++.+}, at: [<c00000000062b0e4>] tty_ldisc_ref_wait+0x34/0x90
[  489.885501]  #1:  (&ldata->atomic_read_lock){+.+...}, at: [<c000000000626e1c>] n_tty_read+0xec/0x960
[  489.885528] 2 locks held by bash/7902:
[  489.885534]  #0:  (&tty->ldisc_sem){++++.+}, at: [<c00000000062b0e4>] tty_ldisc_ref_wait+0x34/0x90
[  489.885552]  #1:  (&ldata->atomic_read_lock){+.+...}, at: [<c000000000626e1c>] n_tty_read+0xec/0x960
[  489.885589] 4 locks held by kworker/24:3/12619:
[  489.885595]  #0:  ("events"){.+.+.+}, at: [<c000000000114d1c>] process_one_work+0x17c/0x7f0
[  489.885612]  #1:  ((event_scan_work).work){+.+...}, at: [<c000000000114d1c>] process_one_work+0x17c/0x7f0
[  489.885630]  #2:  (cpu_hotplug.dep_map){++++++}, at: [<c0000000000e9158>] get_online_cpus+0x58/0xd0
[  489.885648]  #3:  (cpu_hotplug.lock){+.+.+.}, at: [<c0000000000e9194>] get_online_cpus+0x94/0xd0
[  489.885667] 8 locks held by drmgr/14416:
[  489.885673]  #0:  (sb_writers#3){.+.+.+}, at: [<c000000000372030>] vfs_write+0x210/0x240
[  489.885692]  #1:  (&of->mutex){+.+.+.}, at: [<c00000000043d1a4>] kernfs_fop_write+0x144/0x280
[  489.885710]  #2:  (s_active#150){.+.+.+}, at: [<c00000000043d1b0>] kernfs_fop_write+0x150/0x280
[  489.885730]  #3:  (device_hotplug_lock){+.+.+.}, at: [<c000000000680e30>] lock_device_hotplug_sysfs+0x20/0x90
[  489.885749]  #4:  (&dev->mutex){......}, at: [<c000000000682f64>] device_online+0x34/0x120
[  489.885766]  #5:  (cpu_add_remove_lock){+.+.+.}, at: [<c0000000000ec288>] do_cpu_up+0xc8/0x150
[  489.885784]  #6:  (cpu_hotplug.dep_map){++++++}, at: [<c0000000000ebdd8>] cpu_hotplug_begin+0x8/0x100
[  489.885802]  #7:  (cpu_hotplug.lock){+.+.+.}, at: [<c0000000000ebe68>] cpu_hotplug_begin+0x98/0x100
[  489.885821] 3 locks held by cpuhp/136/14931:
[  489.885827]  #0:  (sched_domains_mutex){+.+.+.}, at: [<c00000000015f23c>] partition_sched_domains+0x8c/0x4f0
[  489.885845]  #1:  (cpu_hotplug.dep_map){++++++}, at: [<c0000000000e9158>] get_online_cpus+0x58/0xd0
[  489.885863]  #2:  (cpu_hotplug.lock){+.+.+.}, at: [<c0000000000e9194>] get_online_cpus+0x94/0xd0
[  489.885880]
[  489.885885] =============================================
[  489.885885]

This patch provides a way to short-circuit the call chains underneath
rebuild_sched_domains which run through arch_update_cpu_topology and
eventually reach get_online_cpus, where they deadlock.  While a DLPAR
hot-add CPU operation is in flight, only a call to
arch_update_cpu_topology made from the thread performing that DLPAR
operation is allowed to proceed; calls from any other thread return
immediately.  To compensate for the skipped updates,
arch_update_cpu_topology will be invoked by a repeating timer worker
function shortly after the device_online operation has completed.

This workaround is completely contained within the powerpc-arch code.

Signed-off-by: Michael Bringmann <mwb at linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/sparsemem.h         |    4 +++
 arch/powerpc/mm/numa.c                       |   31 ++++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/hotplug-cpu.c |    4 +++
 3 files changed, 39 insertions(+)

diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index c88930c..2cf0bac 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -26,8 +26,12 @@ static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { }
 #endif
 
 #ifdef CONFIG_NUMA
+extern int set_single_dlpar_cpu_readchg(void);
+extern int clear_single_dlpar_cpu_readchg(void);
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
 #else
+#define set_single_dlpar_cpu_readchg()	0
+#define clear_single_dlpar_cpu_readchg()	0
 static inline int hot_add_scn_to_nid(unsigned long scn_addr)
 {
 	return 0;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4f06409..05325a0 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1305,6 +1305,62 @@ static int update_lookup_table(void *data)
 }
 
 /*
+ * Serialize topology updates against DLPAR CPU add/remove: while a
+ * change is in flight, only the thread that initiated the DLPAR
+ * operation may run arch_update_cpu_topology().
+ *
+ * The flag and pid are written under dlpar_cpu_chg_lock so that two
+ * concurrent DLPAR operations cannot both claim ownership (an
+ * unlocked check-then-set would be racy).  They are read without the
+ * lock in arch_update_cpu_topology(); that read is racy but benign,
+ * since a stale value only delays one topology update.
+ */
+static DEFINE_SPINLOCK(dlpar_cpu_chg_lock);
+static int dlpar_cpu_chg_in_progress;
+static pid_t dlpar_cpu_chg_pid = -1;	/* -1 == no owner, as in clear path */
+
+/*
+ * Mark the current thread as the owner of an in-flight DLPAR CPU
+ * change.  Returns 0 on success, or the pid of the existing owner if
+ * a change is already in progress.
+ */
+int set_single_dlpar_cpu_readchg(void)
+{
+	int rc = 0;
+
+	spin_lock(&dlpar_cpu_chg_lock);
+	if (dlpar_cpu_chg_in_progress) {
+		rc = dlpar_cpu_chg_pid;
+	} else {
+		dlpar_cpu_chg_in_progress = 1;
+		dlpar_cpu_chg_pid = current->pid;
+	}
+	spin_unlock(&dlpar_cpu_chg_lock);
+
+	return rc;
+}
+
+/*
+ * Release ownership of the in-flight DLPAR CPU change.  Returns 0 on
+ * success, or the caller's pid if the caller is not the owner.
+ */
+int clear_single_dlpar_cpu_readchg(void)
+{
+	int rc = 0;
+
+	spin_lock(&dlpar_cpu_chg_lock);
+	if (dlpar_cpu_chg_pid != current->pid) {
+		rc = current->pid;
+	} else {
+		dlpar_cpu_chg_in_progress = 0;
+		dlpar_cpu_chg_pid = -1;
+	}
+	spin_unlock(&dlpar_cpu_chg_lock);
+
+	return rc;
+}
+
+/*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  */
@@ -1317,6 +1345,9 @@ int arch_update_cpu_topology(void)
 	struct device *dev;
 	int weight, new_nid, i = 0;
 
+	if (dlpar_cpu_chg_in_progress && (dlpar_cpu_chg_pid != current->pid))
+		return 0;
+
 	if (!prrn_enabled && !vphn_enabled) {
 		if (!topology_inited)
 			topology_update_needed = 1;
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7bc0e91..0e97086 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -356,7 +356,9 @@ static int dlpar_online_cpu(struct device_node *dn)
 			BUG_ON(get_cpu_current_state(cpu)
 					!= CPU_STATE_OFFLINE);
 			cpu_maps_update_done();
+			set_single_dlpar_cpu_readchg();
 			rc = device_online(get_cpu_device(cpu));
+			clear_single_dlpar_cpu_readchg();
 			if (rc)
 				goto out;
 			cpu_maps_update_begin();
@@ -522,7 +524,9 @@ static int dlpar_offline_cpu(struct device_node *dn)
 				set_preferred_offline_state(cpu,
 							    CPU_STATE_OFFLINE);
 				cpu_maps_update_done();
+				set_single_dlpar_cpu_readchg();
 				rc = device_offline(get_cpu_device(cpu));
+				clear_single_dlpar_cpu_readchg();
 				if (rc)
 					goto out;
 				cpu_maps_update_begin();



More information about the Linuxppc-dev mailing list