[Skiboot] [PATCH 7/7] hmi: Add handling for NPU checkstops

Russell Currey ruscur at russell.cc
Tue Mar 15 18:33:57 AEDT 2016


If the NPU detects an unrecoverable error, it will send an HMI.  This is
problematic since unhandled HMIs will checkstop the entire system, which
is not the intended behaviour of an NPU failure.  Instead, the
NPU-emulated PCI devices should be fenced as part of EEH.

Add support for handling NPU HMIs.  This works by finding the NPU
responsible for the HMI, checking its error registers, and sending a
recoverable HMI event.  The NPU itself cannot actually recover, but the
system should not be brought down.  Fence mode is set on the NPU, such
that any further operations on the NPU will trigger EEH, and it will be
subsequently fenced from the system.

Signed-off-by: Russell Currey <ruscur at russell.cc>
---
 core/hmi.c         | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/opal-api.h |  1 +
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/core/hmi.c b/core/hmi.c
index faf99ca..2c1ac95 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -24,6 +24,8 @@
 #include <pci.h>
 #include <cpu.h>
 #include <chip.h>
+#include <npu-regs.h>
+#include <npu.h>
 
 /*
  * HMER register layout:
@@ -433,6 +435,77 @@ static void find_nx_checkstop_reason(int flat_chip_id,
 	*event_generated = 1;
 }
 
+/*
+ * If the NPU on chip flat_chip_id has an unmasked fatal FIR bit set, fence
+ * it and queue a recoverable HMI event, setting *event_generated to 1.
+ */
+static void find_npu_checkstop_reason(int flat_chip_id,
+				      struct OpalHMIEvent *hmi_evt,
+				      int *event_generated)
+{
+	struct phb *phb;
+	struct npu *p = NULL;
+	int i;
+	uint64_t npu_fir, npu_fir_mask;
+	uint64_t npu_fir_action0, npu_fir_action1;
+	uint64_t fatal_errors;
+
+	/* Find the NPU on the chip associated with the HMI. */
+	for (i = 0; i < MAX_PHB_ID; i++) {
+		phb = pci_get_phb(i);
+
+		/* Skip empty slots: a NULL here need not be the last PHB. */
+		if (phb == NULL)
+			continue;
+
+		if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") &&
+		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+			p = phb_to_npu(phb);
+			break;
+		}
+	}
+
+	/* If we didn't find an NPU on the chip, it's not our checkstop. */
+	if (p == NULL)
+		return;
+
+	/* Read all the registers necessary to find a checkstop condition. */
+	if (xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR, &npu_fir) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
+		prerror("HMI: Couldn't read NPU registers with XSCOM\n");
+		return;
+	}
+
+	fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1;
+
+	/* No unmasked FIR bit with both actions set: nothing to report. */
+	if (!fatal_errors)
+		return;
+
+	prlog(PR_DEBUG,
+	      "NPU: FIR %llx FIR mask %llx FIR ACTION0 %llx FIR ACTION1 %llx\n",
+	      npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
+
+	/* Set the NPU to fenced since it can't recover. */
+	p->fenced = true;
+
+	/* Set up the HMI event */
+	hmi_evt->severity = OpalHMI_SEV_WARNING;
+	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
+
+	/* The HMI is "recoverable" because it shouldn't crash the system */
+	queue_hmi_event(hmi_evt, 1);
+	*event_generated = 1;
+}
+
 static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
 {
 	int i;
@@ -450,8 +523,11 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
 				queue_hmi_event(hmi_evt, recover);
 				event_generated = 1;
 			}
-
 			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
+			/* Only check for NPU errors if we have a NPU. */
+			if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P8NVL)
+				find_npu_checkstop_reason(i, hmi_evt,
+							  &event_generated);
 		}
 
 	if (recover != -1) {
diff --git a/include/opal-api.h b/include/opal-api.h
index 369aa93..0b7b0bb 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -577,6 +577,7 @@ enum OpalHMI_XstopType {
 	CHECKSTOP_TYPE_UNKNOWN	=	0,
 	CHECKSTOP_TYPE_CORE	=	1,
 	CHECKSTOP_TYPE_NX	=	2,
+	CHECKSTOP_TYPE_NPU	=	3
 };
 
 enum OpalHMI_CoreXstopReason {
-- 
2.7.3



More information about the Skiboot mailing list