[Skiboot] [PATCH v3 3/4] opal: Check Core FIRs to find the reason for Malfunction Alert.
Mahesh J Salgaonkar
mahesh at linux.vnet.ibm.com
Thu Mar 5 16:06:31 AEDT 2015
From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
On a Malfunction Alert, read all core FIRs to identify the reason for the
core checkstop and populate an HMI event with the relevant error information.
This patch changes the HMI event version to 2 (OpalHMIEvt_V2).
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
core/hmi.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 120 insertions(+), 3 deletions(-)
diff --git a/core/hmi.c b/core/hmi.c
index 200a890..bb4c08d 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -23,6 +23,8 @@
#include <xscom.h>
#include <capp.h>
#include <pci.h>
+#include <cpu.h>
+#include <chip.h>
/*
* HMER register layout:
@@ -145,6 +147,32 @@
* NOTE: Per Dave Larson, never enable 8,9,21-23
*/
+/* XSCOM address of the core FIR (Fault Isolation Register); combined with a core id via XSCOM_ADDR_P8_EX() */
+#define CORE_FIR 0x10013100
+
+static const struct core_xstop_bit_info {
+ uint8_t bit; /* CORE FIR bit number, tested with PPC_BIT() */
+ enum OpalHMI_CoreXstopReason reason; /* reason value OR-ed into the HMI event when 'bit' is set */
+} xstop_bits[] = {
+ { 3, CORE_CHECKSTOP_IFU_REGFILE },
+ { 5, CORE_CHECKSTOP_IFU_LOGIC },
+ { 8, CORE_CHECKSTOP_PC_DURING_RECOV },
+ { 10, CORE_CHECKSTOP_ISU_REGFILE },
+ { 12, CORE_CHECKSTOP_ISU_LOGIC },
+ { 21, CORE_CHECKSTOP_FXU_LOGIC },
+ { 25, CORE_CHECKSTOP_VSU_LOGIC },
+ { 26, CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE },
+ { 32, CORE_CHECKSTOP_LSU_REGFILE },
+ { 36, CORE_CHECKSTOP_PC_FWD_PROGRESS },
+ { 38, CORE_CHECKSTOP_LSU_LOGIC },
+ { 45, CORE_CHECKSTOP_PC_LOGIC },
+ { 48, CORE_CHECKSTOP_PC_HYP_RESOURCE },
+ { 52, CORE_CHECKSTOP_PC_HANG_RECOV_FAILED },
+ { 54, CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED },
+ { 60, CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ },
+ { 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
+};
+
static struct lock hmi_lock = LOCK_UNLOCKED;
static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover)
@@ -162,7 +190,7 @@ static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover)
hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;
/*
- * struct OpalHMIEvent is of (3 * 64 bits) size and well packed
+ * V2 of struct OpalHMIEvent is of (4 * 64 bits) size and well packed
* structure. Hence use uint64_t pointer to pass entire structure
* using 4 params in generic message format.
*/
@@ -170,7 +198,8 @@ static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover)
/* queue up for delivery to host. */
return opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
- hmi_data[0], hmi_data[1], hmi_data[2]);
+ hmi_data[0], hmi_data[1], hmi_data[2],
+ hmi_data[3]);
}
static int is_capp_recoverable(int chip_id)
@@ -221,11 +250,81 @@ static int decode_one_malfunction(int flat_chip_id, struct OpalHMIEvent *hmi_evt
return 0;
}
+static bool decode_core_fir(struct cpu_thread *cpu,
+ struct OpalHMIEvent *hmi_evt)
+{
+ uint64_t core_fir;
+ uint32_t core_id;
+ int i;
+ bool found = false; /* becomes true once any CORE FIR bit listed in xstop_bits[] is seen set */
+
+ /* Sanity check: nothing to decode without a CPU and an event to fill in. */
+ if (!cpu || !hmi_evt)
+ return false;
+
+ core_id = pir_to_core_id(cpu->pir);
+
+ /* Get CORE FIR register value; on XSCOM failure log it and report "not found". */
+ if (xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
+ &core_fir) != 0) {
+ prerror("HMI: XSCOM error reading CORE FIR\n");
+ return false;
+ }
+
+ prlog(PR_INFO, "HMI: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",
+ cpu->chip_id, core_id, core_fir);
+
+ /* OR the reason of every set xstop_bits[] entry into the HMI event (xstop_reason is never cleared here). */
+ for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) {
+ if (core_fir & PPC_BIT(xstop_bits[i].bit)) {
+ found = true;
+ hmi_evt->u.xstop_error.xstop_reason
+ |= xstop_bits[i].reason;
+ }
+ }
+ return found; /* true if at least one listed FIR bit was set, false otherwise */
+}
+
+static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
+ int *event_generated)
+{
+ struct cpu_thread *cpu;
+
+ /* Initialize the HMI event fields that are the same for every core reported below. */
+ hmi_evt->severity = OpalHMI_SEV_FATAL;
+ hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+ hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_CORE;
+
+ /*
+ * Check CORE FIRs and find the reason for core checkstop.
+ * Queue a separate HMI event for each core whose FIR decodes to a reason.
+ */
+ for_each_cpu(cpu) {
+ /* GARDed CPUs are marked unavailable. Skip them. */
+ if (cpu->state == cpu_state_unavailable)
+ continue;
+
+ /* Only check on primaries (ie. core), not threads */
+ if (cpu->is_secondary)
+ continue;
+
+ /* Reset the per-core xstop_error fields; hmi_evt is reused across iterations. */
+ hmi_evt->u.xstop_error.xstop_reason = 0;
+ hmi_evt->u.xstop_error.u.pir = cpu->pir;
+
+ if (decode_core_fir(cpu, hmi_evt)) {
+ queue_hmi_event(hmi_evt, 0); /* recover = 0; NOTE(review): return value is ignored */
+ *event_generated = 1; /* tell the caller an event was submitted for at least one core */
+ }
+ }
+}
+
static int decode_malfunction(struct OpalHMIEvent *hmi_evt)
{
int i;
int recover = -1;
uint64_t malf_alert;
+ int event_generated = 0;
xscom_read(this_cpu()->chip_id, 0x2020011, &malf_alert);
@@ -233,8 +332,26 @@ static int decode_malfunction(struct OpalHMIEvent *hmi_evt)
if (malf_alert & PPC_BIT(i)) {
recover = decode_one_malfunction(i, hmi_evt);
xscom_write(this_cpu()->chip_id, 0x02020011, ~PPC_BIT(i));
+ if (recover) {
+ queue_hmi_event(hmi_evt, recover);
+ event_generated = 1;
+ }
}
+ if (recover != -1) {
+ find_core_checkstop_reason(hmi_evt, &event_generated);
+
+ /*
+ * If we failed to find the checkstop reason, send an
+ * unknown HMI event.
+ */
+ if (!event_generated) {
+ hmi_evt->u.xstop_error.xstop_type =
+ CHECKSTOP_TYPE_UNKNOWN;
+ hmi_evt->u.xstop_error.xstop_reason = 0;
+ }
+ }
+
return recover;
}
@@ -345,7 +462,7 @@ static int64_t opal_handle_hmi(void)
BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));
memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
- hmi_evt.version = OpalHMIEvt_V1;
+ hmi_evt.version = OpalHMIEvt_V2;
lock(&hmi_lock);
hmer = mfspr(SPR_HMER); /* Get HMER register value */
More information about the Skiboot
mailing list