[Skiboot] [RFC PATCH 6/6] opal/hmi: Send an error callout on threshold.

Mahesh Salgaonkar mahesh at linux.vnet.ibm.com
Wed Apr 24 04:13:17 AEST 2019


When the error threshold is hit, send out an error log (eSEL) with a hardware callout.

TODO:
- Send a gard-able event to HBRT to create a gard record for the faulty core/chip.
- Need to figure out a way to inform the Linux host to offline the faulty core.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/hmi.c         |   36 ++++++++++++++++++++++++++++++++++--
 include/errorlog.h |    1 +
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index cac8505d4..dd05d5369 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -34,6 +34,7 @@
 #include <nvram.h>
 #include <cpu.h>
 #include <timebase.h>
+#include <errorlog.h>
 
 /*
  * HMER register layout:
@@ -289,6 +290,9 @@ static uint32_t nx_status_reg;
 static uint32_t nx_dma_engine_fir;
 static uint32_t nx_pbi_fir;
 
+DEFINE_LOG_ENTRY(OPAL_RC_HW_ERROR_THRESHOLD, OPAL_PLATFORM_ERR_EVT, OPAL_CEC,
+		OPAL_PLATFORM_FIRMWARE, OPAL_ERROR_PANIC, OPAL_NA);
+
 static int setup_scom_addresses(void)
 {
 	switch (proc_gen) {
@@ -311,6 +315,26 @@ static int setup_scom_addresses(void)
 	return 0;
 }
 
+static int send_serviceable_event(void)
+{
+	struct errorlog *buf;
+	const char *loc, *part, *serial;
+
+	loc = chip_loc_code(this_cpu()->chip_id);
+	part = chip_part_number(this_cpu()->chip_id);
+	serial = chip_serial_number(this_cpu()->chip_id);
+	buf = opal_elog_create(&e_info(OPAL_RC_HW_ERROR_THRESHOLD), 0);
+	if (buf) {
+		/* TODO: Add error details as well. */
+		log_append_msg(buf, "TOD/TB error threshold reached");
+		log_mark_serviceable(buf);
+		log_add_callout_section(buf, loc, part, serial);
+		log_commit(buf);
+		return 1;
+	}
+	return 0;
+}
+
 static void threshold_check(hmi_err_count_t *counters, int error,
 							uint64_t tb_now)
 {
@@ -345,7 +369,11 @@ static void threshold_check(hmi_err_count_t *counters, int error,
 		counters[error].flags |= HMI_THRESHOLD_LEVEL1;
 		prlog(PR_DEBUG, "Threshold level1 limit reached for %d\n",
 								error);
-		/* TODO: Generate an errorlog with HW callout details. */
+		send_serviceable_event();
+		/*
+		 * TODO: Send a gard-able event to HBRT to gard faulty
+		 * chip/core on next IPL.
+		 */
 	}
 
 	if (tb_now > level2_timer) {
@@ -356,7 +384,11 @@ static void threshold_check(hmi_err_count_t *counters, int error,
 		counters[error].flags |= HMI_THRESHOLD_LEVEL2;
 		prlog(PR_DEBUG, "Threshold level2 limit reached for %d\n",
 								error);
-		/* TODO: Generate an errorlog with HW callout details. */
+		send_serviceable_event();
+		/*
+		 * TODO: Send a gard-able event to HBRT to gard faulty
+		 * chip/core on next IPL.
+		 */
 	}
 }
 
diff --git a/include/errorlog.h b/include/errorlog.h
index a99c63fc5..00cd69783 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -340,6 +340,7 @@ enum opal_reasoncode {
 
 /* Platform error */
 	OPAL_RC_ABNORMAL_REBOOT	    = OPAL_SRC_COMPONENT_CEC | 0x10,
+	OPAL_RC_HW_ERROR_THRESHOLD  = OPAL_SRC_COMPONENT_CEC | 0x11,
 
 /* FSP console */
 	OPAL_RC_CONSOLE_HANG	    = OPAL_SRC_COMPONENT_CONSOLE | 0x10,



More information about the Skiboot mailing list