[Skiboot] [PATCH 5/5] nvlink: Add primitive EEH support for NPU devices
Russell Currey
ruscur at russell.cc
Thu Jan 7 14:36:32 AEDT 2016
Implements Extended Error Handling callbacks for NVLink devices.
At present, this supports fence mode emulation and some easily detectable
freezes. There is still a lot of work to be done here, but this enables
EEH to work as expected in some specific scenarios.
Signed-off-by: Russell Currey <ruscur at russell.cc>
---
hw/npu.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 47 insertions(+), 5 deletions(-)
diff --git a/hw/npu.c b/hw/npu.c
index ba61d2d..1750182 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -766,7 +766,6 @@ static void npu_err_interrupt(void *data, uint32_t isn)
prerror("Invalid NPU error interrupt received\n");
break;
case 6 ... 7:
- NPUERR(p, "Error handling not implemented\n");
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
OPAL_EVENT_PCI_ERROR);
}
@@ -991,6 +990,13 @@ static int64_t npu_power_state(struct phb *phb __unused)
return OPAL_SHPC_POWER_ON;
}
+static int64_t npu_hreset(struct phb *phb __unused)
+{
+ prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
+
+ return OPAL_SUCCESS;
+}
+
static int64_t npu_freset(struct phb *phb __unused)
{
/* FIXME: PHB fundamental reset, which need to be
@@ -1000,7 +1006,7 @@ static int64_t npu_freset(struct phb *phb __unused)
return OPAL_SUCCESS;
}
-static int64_t npu_freeze_status(struct phb *phb __unused,
+static int64_t npu_freeze_status(struct phb *phb,
uint64_t pe_number __unused,
uint8_t *freeze_state,
uint16_t *pci_error_type __unused,
@@ -1012,7 +1018,43 @@ static int64_t npu_freeze_status(struct phb *phb __unused,
* introduce another PHB callback to translate it. For now,
* it keeps the skiboot PCI enumeration going.
*/
- *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ struct npu *p = phb_to_npu(phb);
+ if (p->fenced)
+ *freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+ else
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu *p = phb_to_npu(phb);
+ int i;
+ uint64_t result = 0;
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ if (p->fenced) {
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS;
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
+ for (i = 0; i < p->total_devices; i++) {
+ result = in_be64(p->at_regs + NPU_IODA_DATA0);
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
return OPAL_SUCCESS;
}
@@ -1076,14 +1118,14 @@ static const struct phb_ops npu_ops = {
.power_state = npu_power_state,
.slot_power_off = NULL,
.slot_power_on = NULL,
- .hot_reset = NULL,
+ .hot_reset = npu_hreset,
.fundamental_reset = npu_freset,
.complete_reset = NULL,
.poll = NULL,
.eeh_freeze_status = npu_freeze_status,
.eeh_freeze_clear = NULL,
.eeh_freeze_set = NULL,
- .next_error = NULL,
+ .next_error = npu_eeh_next_error,
.err_inject = npu_err_inject,
.get_diag_data = NULL,
.get_diag_data2 = NULL,
--
2.6.4
More information about the Skiboot
mailing list