[Skiboot] [PATCH] nvlink: Print error message when NPU is fenced

Russell Currey ruscur at russell.cc
Mon Jun 20 16:41:36 AEST 2016


NPU fences aren't recoverable, and as such, would require user
intervention to have a working system again.  The fence will be picked up
by the kernel through EEH, but this doesn't happen until the NPU is used
for something.  So, let's print a message so it's obvious when this
happens.

A helper function was added to reduce duplication.  This also enables code
in skiboot to un-fence an NPU, which is useful to NPU developers but very
stupid otherwise.

Signed-off-by: Russell Currey <ruscur at russell.cc>
---
 core/hmi.c    |  2 +-
 hw/npu.c      | 13 ++++++++++++-
 include/npu.h |  2 ++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 53581f1..67a9423 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -519,7 +519,7 @@ static void find_npu_checkstop_reason(int flat_chip_id,
 	      npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
 
 	/* Set the NPU to fenced since it can't recover. */
-	p->fenced = true;
+	npu_set_fence_state(p, true);
 
 	/* Set up the HMI event */
 	hmi_evt->severity = OpalHMI_SEV_WARNING;
diff --git a/hw/npu.c b/hw/npu.c
index e444b96..c0c8631 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -1083,6 +1083,17 @@ static int64_t npu_eeh_next_error(struct phb *phb,
 	return OPAL_SUCCESS;
 }
 
+/* Set the NPU fence flag and log it; for error injection and handling. */
+void npu_set_fence_state(struct npu *p, bool fence) {
+	p->fenced = fence;
+
+	if (fence)
+		prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n",
+		      p->chip_id);
+	else
+		prlog(PR_WARNING, "NPU: un-fencing is dangerous and should "
+		      "only be used for development purposes.\n");
+}
 
 /* Sets the NPU to trigger an error when a DMA occurs */
 static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
@@ -1116,7 +1127,7 @@ static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
 		return OPAL_PARAMETER;
 	} else if (type == 1) {
 		/* Emulate fence mode. */
-		p->fenced = true;
+		npu_set_fence_state(p, true);
 	} else {
 		/* Cause a freeze with an invalid MMIO write. */
 		in_be64((void *)dev->bar.base);
diff --git a/include/npu.h b/include/npu.h
index ff6201e..259e803 100644
--- a/include/npu.h
+++ b/include/npu.h
@@ -201,6 +201,8 @@ int64_t npu_dev_procedure_write(struct npu_dev_trap *trap,
 				uint32_t size,
 				uint32_t data);
 
+void npu_set_fence_state(struct npu *p, bool fence);
+
 #define NPUDBG(p, fmt, a...)	prlog(PR_DEBUG, "NPU%d: " fmt, \
 				      (p)->phb.opal_id, ##a)
 #define NPUINF(p, fmt, a...)	prlog(PR_INFO,  "NPU%d: " fmt, \
-- 
2.9.0



More information about the Skiboot mailing list