[Skiboot] [PATCH] nvlink: Print error message when NPU is fenced
Russell Currey
ruscur at russell.cc
Mon Jun 20 16:41:36 AEST 2016
NPU fences aren't recoverable, and as such, would require user
intervention to have a working system again. The fence will be picked up
by the kernel through EEH, but this doesn't happen until the NPU is used
for something. So, let's print a message so it's obvious when this
happens.
A helper function was added to reduce duplication. This also enables code
in skiboot to un-fence an NPU, which is useful to NPU developers but very
stupid otherwise.
Signed-off-by: Russell Currey <ruscur at russell.cc>
---
core/hmi.c | 2 +-
hw/npu.c | 13 ++++++++++++-
include/npu.h | 2 ++
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/core/hmi.c b/core/hmi.c
index 53581f1..67a9423 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -519,7 +519,7 @@ static void find_npu_checkstop_reason(int flat_chip_id,
npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
/* Set the NPU to fenced since it can't recover. */
- p->fenced = true;
+ npu_set_fence_state(p, true);
/* Set up the HMI event */
hmi_evt->severity = OpalHMI_SEV_WARNING;
diff --git a/hw/npu.c b/hw/npu.c
index e444b96..c0c8631 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -1083,6 +1083,17 @@ static int64_t npu_eeh_next_error(struct phb *phb,
return OPAL_SUCCESS;
}
+/* For use in error injection and handling. */
+void npu_set_fence_state(struct npu *p, bool fence) {
+ p->fenced = fence;
+
+ if (fence)
+ prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n",
+ p->chip_id);
+ else
+ prlog(PR_WARNING, "NPU: un-fencing is dangerous and should \
+ only be used for development purposes.");
+}
/* Sets the NPU to trigger an error when a DMA occurs */
static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
@@ -1116,7 +1127,7 @@ static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
return OPAL_PARAMETER;
} else if (type == 1) {
/* Emulate fence mode. */
- p->fenced = true;
+ npu_set_fence_state(p, true);
} else {
/* Cause a freeze with an invalid MMIO write. */
in_be64((void *)dev->bar.base);
diff --git a/include/npu.h b/include/npu.h
index ff6201e..259e803 100644
--- a/include/npu.h
+++ b/include/npu.h
@@ -201,6 +201,8 @@ int64_t npu_dev_procedure_write(struct npu_dev_trap *trap,
uint32_t size,
uint32_t data);
+void npu_set_fence_state(struct npu *p, bool fence);
+
#define NPUDBG(p, fmt, a...) prlog(PR_DEBUG, "NPU%d: " fmt, \
(p)->phb.opal_id, ##a)
#define NPUINF(p, fmt, a...) prlog(PR_INFO, "NPU%d: " fmt, \
--
2.9.0
More information about the Skiboot
mailing list