[Skiboot] [PATCH] core/console: fix deadlock when printing with console lock held

Nicholas Piggin npiggin at gmail.com
Tue May 8 16:58:53 AEST 2018


Some debugging options will print while the console lock is held,
which is why the console lock is taken as a recursive lock.
However console_write calls __flush_console, which will drop and
re-take the lock non-recursively in some cases.

Just set con_need_flush and return from __flush_console if we are
holding the console lock already.

This stack usage message (taken with this patch applied) could lead
to a deadlock without this:

CPU 0000 lowest stack mark 11768 bytes left pc=300cb808 token=0
CPU 0000 Backtrace:
S: 0000000031c03370 R: 00000000300cb808   .list_check_node+0x1c
S: 0000000031c03410 R: 00000000300cb910   .list_check+0x38
S: 0000000031c034b0 R: 00000000300190ac   .try_lock_caller+0xb8
S: 0000000031c03540 R: 00000000300192e0   .lock_caller+0x80
S: 0000000031c03600 R: 0000000030012c70   .__flush_console+0x134
S: 0000000031c036d0 R: 00000000300130cc   .console_write+0x68
S: 0000000031c03780 R: 00000000300347bc   .vprlog+0xc8
S: 0000000031c03970 R: 0000000030034844   ._prlog+0x50
S: 0000000031c03a00 R: 00000000300364a4   .log_simple_error+0x74
S: 0000000031c03b90 R: 000000003004ab48   .occ_pstates_init+0x184
S: 0000000031c03d50 R: 000000003001480c   .load_and_boot_kernel+0x38c
S: 0000000031c03e30 R: 000000003001571c   .main_cpu_entry+0x62c
S: 0000000031c03f00 R: 0000000030002700   boot_entry+0x1c0

Reported-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
---
 core/console.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/core/console.c b/core/console.c
index b9129c9f..e8256758 100644
--- a/core/console.c
+++ b/core/console.c
@@ -100,7 +100,7 @@ void clear_console(void)
  * Optionally can skip flushing to drivers, leaving messages
  * just in memory console.
  */
-static bool __flush_console(bool flush_to_drivers)
+static bool __flush_console(bool flush_to_drivers, bool need_unlock)
 {
 	struct cpu_thread *cpu = this_cpu();
 	size_t req, len = 0;
@@ -114,8 +114,12 @@ static bool __flush_console(bool flush_to_drivers)
 	 * Console flushing is suspended on this CPU, typically because
 	 * some critical locks are held that would potentially cause a
 	 * flush to deadlock
+	 *
+	 * Also if it recursed on con_lock (need_unlock is false). This
+	 * can happen due to debug code firing (e.g., list or stack
+	 * debugging).
 	 */
-	if (cpu->con_suspend) {
+	if (cpu->con_suspend || !need_unlock) {
 		cpu->con_need_flush = true;
 		return false;
 	}
@@ -176,7 +180,7 @@ bool flush_console(void)
 	bool ret;
 
 	lock(&con_lock);
-	ret = __flush_console(true);
+	ret = __flush_console(true, true);
 	unlock(&con_lock);
 
 	return ret;
@@ -250,7 +254,7 @@ ssize_t console_write(bool flush_to_drivers, const void *buf, size_t count)
 		write_char(c);
 	}
 
-	__flush_console(flush_to_drivers);
+	__flush_console(flush_to_drivers, need_unlock);
 
 	if (need_unlock)
 		unlock(&con_lock);
-- 
2.17.0



More information about the Skiboot mailing list