[Skiboot] [PATCH 2/3] uart: Workaround for unresponsive console

Wed May 20 21:51:38 AEST 2020

We use a 4K internal output buffer for host console writes. Whenever the
kernel makes an opal_console_write() call, the data is first written to
the internal buffer, and uart_con_flush() is then called to flush it to
the UART console.

If the kernel is doing heavy console writes and the BMC becomes
unresponsive (e.g. during a BMC reboot), our internal buffer fills up.
The kernel will keep retrying the write operation, which results in
kernel lockups.

This patch introduces a new timer. The timer is started whenever our
write operation to the UART is not making progress. If the situation
doesn't improve within the specified time (10 seconds), an error is
returned to the kernel.

Signed-off-by: Vasant Hegde <hegdevasant at linux.vnet.ibm.com>
---
 hw/lpc-uart.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index ed56a4bdc..672d4612b 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -217,6 +217,11 @@ static uint8_t *out_buf;
 static uint32_t out_buf_prod;
 static uint32_t out_buf_cons;
 
+#define UART_BUFFER_OUT_TIMEOUT  10
+static uint64_t out_buf_timeout;
+static uint64_t out_buf_prev_len;
+static uint64_t con_flush_timeout;
+
 /* Asynchronous flush, uart_lock must be held */
 static int64_t uart_con_flush(void)
 {
@@ -246,14 +251,31 @@ static int64_t uart_con_flush(void)
 	}
 	if (tx_full != tx_was_full)
 		uart_update_ier();
+
 	if (out_buf_prod != out_buf_cons) {
-		/* Return busy if nothing was flushed this call */
-		if (out_buf_cons == out_buf_cons_initial)
-			return OPAL_BUSY;
 		/* Return partial if there's more to flush */
-		return OPAL_PARTIAL;
+		if (out_buf_cons != out_buf_cons_initial) {
+			con_flush_timeout = 0;
+			return OPAL_PARTIAL;
+		}
+
+		/*
+		 * Nothing was flushed. Start internal timer. We will continue
+		 * returning BUSY until timeout happens, hoping BMC will consume
+		 * data within timeout period.
+		 */
+		if (con_flush_timeout == 0) {
+			con_flush_timeout = mftb() +
+				secs_to_tb(UART_BUFFER_OUT_TIMEOUT);
+		}
+
+		if (tb_compare(mftb(), con_flush_timeout) != TB_AAFTERB)
+			return OPAL_BUSY;
+
+		return OPAL_INTERNAL_ERROR;
 	}
 
+	con_flush_timeout = 0;
 	return OPAL_SUCCESS;
 }
 
@@ -273,6 +295,11 @@ static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
 
 	lock(&uart_lock);
 
+	if (!uart_tx_buf_space()) {
+		unlock(&uart_lock);
+		return OPAL_INTERNAL_ERROR;
+	}
+
 	/* Copy data to out buffer */
 	while (uart_tx_buf_space() && len--) {
 		out_buf[out_buf_prod++] = *(buffer++);
@@ -293,14 +320,35 @@ static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
 static int64_t uart_opal_write_buffer_space(int64_t term_number,
 					    __be64 *__length)
 {
+	uint64_t tx_buf_len;
+
 	if (term_number != 0)
 		return OPAL_PARAMETER;
 
 	lock(&uart_lock);
-	*__length = cpu_to_be64(uart_tx_buf_space());
+	tx_buf_len = uart_tx_buf_space();
+	*__length = cpu_to_be64(tx_buf_len);
 	unlock(&uart_lock);
 
-	return OPAL_SUCCESS;
+	if ((tx_buf_len != out_buf_prev_len) || (tx_buf_len == OUT_BUF_SIZE - 1)) {
+		out_buf_prev_len = tx_buf_len;
+		out_buf_timeout = 0;
+		return OPAL_SUCCESS;
+	}
+
+	/*
+	 * Buffer is full, start internal timer. We will continue returning
+	 * SUCCESS until timeout happens, hoping BMC will consume data within
+	 * timeout period.
+	 */
+	if (out_buf_timeout == 0)
+		out_buf_timeout = mftb() + secs_to_tb(UART_BUFFER_OUT_TIMEOUT);
+
+	if (tb_compare(mftb(), out_buf_timeout) != TB_AAFTERB)
+		return OPAL_SUCCESS;
+
+	/* Timeout happened. Lets drop incoming data */
+	return OPAL_INTERNAL_ERROR;
 }
 
 /* Must be called with UART lock held */
-- 
2.21.1