[Skiboot] [PATCH V2 7/7] hmi: Add handling for NPU checkstops

Russell Currey ruscur at russell.cc
Mon Mar 21 12:00:06 AEDT 2016


If the NPU detects an unrecoverable error, it will send an HMI.  This is
problematic since unhandled HMIs will checkstop the entire system, which
is not the intended behaviour of an NPU failure.  Instead, the NPU's
emulated PCI devices should be fenced as part of EEH.

Add support for handling NPU HMIs.  This works by finding the NPU
responsible for the HMI, checking its error registers, and sending a
recoverable HMI event.  The NPU itself cannot actually recover, but the
system should not be brought down.  Fence mode is set on the NPU, such
that any further operations on the NPU will trigger EEH, and it will be
subsequently fenced from the system.

Signed-off-by: Russell Currey <ruscur at russell.cc>
---
V2:
	- use for_each_phb macro, dealing with non-contiguous PHB IDs
	- add a comment mentioning a future >1 NPU per chip case
	- move the P8NVL check into the function, looks cleaner
---
 core/hmi.c         | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/opal-api.h |  1 +
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/core/hmi.c b/core/hmi.c
index a4c7869..5d8020a 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -24,6 +24,8 @@
 #include <pci.h>
 #include <cpu.h>
 #include <chip.h>
+#include <npu-regs.h>
+#include <npu.h>
 
 /*
  * HMER register layout:
@@ -439,6 +441,74 @@ static void find_nx_checkstop_reason(int flat_chip_id,
 	*event_generated = 1;
 }
 
+static void find_npu_checkstop_reason(int flat_chip_id,
+				      struct OpalHMIEvent *hmi_evt,
+				      int *event_generated)
+{
+	struct phb *phb;
+	struct npu *p = NULL;
+
+	uint64_t npu_fir;
+	uint64_t npu_fir_mask;
+	uint64_t npu_fir_action0;
+	uint64_t npu_fir_action1;
+	uint64_t fatal_errors;
+
+	/* An NPU is only looked for when the PVR type is P8NVL; else bail. */
+	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
+		return;
+
+	/* Find the NPU PHB whose device-tree chip id matches flat_chip_id. */
+	for_each_phb(phb) {
+		/* NOTE: stops at the first match; adjust if a chip has >1 NPU */
+		if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") &&
+		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+			p = phb_to_npu(phb);
+			break;
+		}
+	}
+
+	/* No NPU on this chip, so the checkstop cannot have come from one. */
+	if (p == NULL)
+		return;
+
+	/* Read FIR, mask and both action registers; give up if any read fails. */
+	if (xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR, &npu_fir) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
+	    xscom_read(flat_chip_id,
+		       p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
+		prerror("HMI: Couldn't read NPU registers with XSCOM\n");
+		return;
+	}
+	/* Keep FIR bits that are unmasked and set in both ACTION0 and ACTION1. */
+	fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1;
+
+	/* If there are no fatal errors, there is nothing more to do. */
+	if (!fatal_errors)
+		return;
+
+	prlog(PR_DEBUG,
+	      "NPU: FIR %llx FIR mask %llx FIR ACTION0 %llx FIR ACTION1 %llx\n",
+	      npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
+
+	/* Mark the NPU as fenced, since it cannot recover from this error. */
+	p->fenced = true;
+
+	/* Fill in the event: warning-level malfunction alert, NPU checkstop. */
+	hmi_evt->severity = OpalHMI_SEV_WARNING;
+	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
+
+	/* Queue as "recoverable" (1): the HMI should not crash the system. */
+	queue_hmi_event(hmi_evt, 1);
+	*event_generated = 1;
+}
+
 static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
 {
 	int i;
@@ -456,8 +526,8 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
 				queue_hmi_event(hmi_evt, recover);
 				event_generated = 1;
 			}
-
 			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
+			find_npu_checkstop_reason(i, hmi_evt, &event_generated);
 		}
 
 	if (recover != -1) {
diff --git a/include/opal-api.h b/include/opal-api.h
index 369aa93..0b7b0bb 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -577,6 +577,7 @@ enum OpalHMI_XstopType {
 	CHECKSTOP_TYPE_UNKNOWN	=	0,
 	CHECKSTOP_TYPE_CORE	=	1,
 	CHECKSTOP_TYPE_NX	=	2,
+	CHECKSTOP_TYPE_NPU	=	3
 };
 
 enum OpalHMI_CoreXstopReason {
-- 
2.7.3



More information about the Skiboot mailing list