[Skiboot] [RFC PATCH 6/6] opal/hmi: Send an error callout on threshold.
Mahesh Salgaonkar
mahesh at linux.vnet.ibm.com
Wed Apr 24 04:13:17 AEST 2019
When the threshold is hit, send out an error log (eSEL) with a hardware callout.
TODO:
- Send a gard-able event to HBRT to create a gard record for the faulty core/chip.
- Need to figure out a way to inform the Linux host to offline the faulty core.
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
core/hmi.c | 36 ++++++++++++++++++++++++++++++++++--
include/errorlog.h | 1 +
2 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/core/hmi.c b/core/hmi.c
index cac8505d4..dd05d5369 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -34,6 +34,7 @@
#include <nvram.h>
#include <cpu.h>
#include <timebase.h>
+#include <errorlog.h>
/*
* HMER register layout:
@@ -289,6 +290,9 @@ static uint32_t nx_status_reg;
static uint32_t nx_dma_engine_fir;
static uint32_t nx_pbi_fir;
+DEFINE_LOG_ENTRY(OPAL_RC_HW_ERROR_THRESHOLD, OPAL_PLATFORM_ERR_EVT, OPAL_CEC,
+ OPAL_PLATFORM_FIRMWARE, OPAL_ERROR_PANIC, OPAL_NA); /* log entry used by send_serviceable_event() when an HMI error threshold is reached; NOTE(review): confirm OPAL_PLATFORM_FIRMWARE / OPAL_ERROR_PANIC are the intended classification for a hardware callout */
+
static int setup_scom_addresses(void)
{
switch (proc_gen) {
@@ -311,6 +315,26 @@ static int setup_scom_addresses(void)
return 0;
}
+static int send_serviceable_event(void) /* returns 1 if an error log was created and committed, 0 if opal_elog_create() gave no buffer */
+{
+ struct errorlog *buf;
+ const char *loc, *part, *serial;
+
+ loc = chip_loc_code(this_cpu()->chip_id); /* callout data describes the chip of the CPU running this code */
+ part = chip_part_number(this_cpu()->chip_id);
+ serial = chip_serial_number(this_cpu()->chip_id);
+ buf = opal_elog_create(&e_info(OPAL_RC_HW_ERROR_THRESHOLD), 0);
+ if (buf) {
+ /* TODO: Add error details as well; the message below is fixed text and does not name which error hit the threshold. */
+ log_append_msg(buf, "TOD/TB error threshold reached");
+ log_mark_serviceable(buf);
+ log_add_callout_section(buf, loc, part, serial); /* NOTE(review): loc/part/serial are passed unchecked - confirm the chip_*() helpers cannot return NULL */
+ log_commit(buf);
+ return 1;
+ }
+ return 0;
+}
+
static void threshold_check(hmi_err_count_t *counters, int error,
uint64_t tb_now)
{
@@ -345,7 +369,11 @@ static void threshold_check(hmi_err_count_t *counters, int error,
counters[error].flags |= HMI_THRESHOLD_LEVEL1;
prlog(PR_DEBUG, "Threshold level1 limit reached for %d\n",
error);
- /* TODO: Generate an errorlog with HW callout details. */
+ send_serviceable_event();
+ /*
+ * TODO: Send a gard-able event to HBRT to gard faulty
+ * chip/core on next IPL.
+ */
}
if (tb_now > level2_timer) {
@@ -356,7 +384,11 @@ static void threshold_check(hmi_err_count_t *counters, int error,
counters[error].flags |= HMI_THRESHOLD_LEVEL2;
prlog(PR_DEBUG, "Threshold level2 limit reached for %d\n",
error);
- /* TODO: Generate an errorlog with HW callout details. */
+ send_serviceable_event();
+ /*
+ * TODO: Send a gard-able event to HBRT to gard faulty
+ * chip/core on next IPL.
+ */
}
}
diff --git a/include/errorlog.h b/include/errorlog.h
index a99c63fc5..00cd69783 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -340,6 +340,7 @@ enum opal_reasoncode {
/* Platform error */
OPAL_RC_ABNORMAL_REBOOT = OPAL_SRC_COMPONENT_CEC | 0x10,
+ OPAL_RC_HW_ERROR_THRESHOLD = OPAL_SRC_COMPONENT_CEC | 0x11,
/* FSP console */
OPAL_RC_CONSOLE_HANG = OPAL_SRC_COMPONENT_CONSOLE | 0x10,
More information about the Skiboot
mailing list