[Skiboot] [PATCH 5/5] nvlink: Add primitive EEH support for NPU devices

Russell Currey ruscur at russell.cc
Thu Jan 7 14:36:32 AEDT 2016


Implements Extended Error Handling callbacks for NVLink devices.

At present, this supports fence mode emulation and some easily detectable
freezes. There is still a lot of work to be done here, but this enables
EEH to work as expected in some specific scenarios.

Signed-off-by: Russell Currey <ruscur at russell.cc>
---
 hw/npu.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/hw/npu.c b/hw/npu.c
index ba61d2d..1750182 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -766,7 +766,6 @@ static void npu_err_interrupt(void *data, uint32_t isn)
 		prerror("Invalid NPU error interrupt received\n");
 		break;
 	case 6 ... 7:
-		NPUERR(p, "Error handling not implemented\n");
 		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
 					OPAL_EVENT_PCI_ERROR);
 	}
@@ -991,6 +990,13 @@ static int64_t npu_power_state(struct phb *phb __unused)
 	return OPAL_SHPC_POWER_ON;
 }
 
+static int64_t npu_hreset(struct phb *phb __unused)
+{
+	prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
+
+	return OPAL_SUCCESS;
+}
+
 static int64_t npu_freset(struct phb *phb __unused)
 {
 	/* FIXME: PHB fundamental reset, which need to be
@@ -1000,7 +1006,7 @@ static int64_t npu_freset(struct phb *phb __unused)
 	return OPAL_SUCCESS;
 }
 
-static int64_t npu_freeze_status(struct phb *phb __unused,
+static int64_t npu_freeze_status(struct phb *phb,
 				     uint64_t pe_number __unused,
 				     uint8_t *freeze_state,
 				     uint16_t *pci_error_type __unused,
@@ -1012,7 +1018,43 @@ static int64_t npu_freeze_status(struct phb *phb __unused,
 	 * introduce another PHB callback to translate it. For now,
 	 * it keeps the skiboot PCI enumeration going.
 	 */
-	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+	struct npu *p = phb_to_npu(phb);
+	if (p->fenced)
+		*freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
+	else
+		*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_eeh_next_error(struct phb *phb,
+				  uint64_t *first_frozen_pe,
+				  uint16_t *pci_error_type,
+				  uint16_t *severity)
+{
+	struct npu *p = phb_to_npu(phb);
+	int i;
+	uint64_t result = 0;
+	*first_frozen_pe = -1;
+	*pci_error_type = OPAL_EEH_NO_ERROR;
+	*severity = OPAL_EEH_SEV_NO_ERROR;
+
+	if (p->fenced) {
+		*pci_error_type = OPAL_EEH_PHB_ERROR;
+		*severity = OPAL_EEH_SEV_PHB_FENCED;
+		return OPAL_SUCCESS;
+	}
+
+	npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
+	for (i = 0; i < p->total_devices; i++) {
+		result = in_be64(p->at_regs + NPU_IODA_DATA0);
+		if (result > 0) {
+			*first_frozen_pe = i;
+			*pci_error_type = OPAL_EEH_PE_ERROR;
+			*severity = OPAL_EEH_SEV_PE_ER;
+			break;
+		}
+	}
+
 	return OPAL_SUCCESS;
 }
 
@@ -1076,14 +1118,14 @@ static const struct phb_ops npu_ops = {
 	.power_state		= npu_power_state,
 	.slot_power_off		= NULL,
 	.slot_power_on		= NULL,
-	.hot_reset		= NULL,
+	.hot_reset		= npu_hreset,
 	.fundamental_reset	= npu_freset,
 	.complete_reset		= NULL,
 	.poll			= NULL,
 	.eeh_freeze_status	= npu_freeze_status,
 	.eeh_freeze_clear	= NULL,
 	.eeh_freeze_set		= NULL,
-	.next_error		= NULL,
+	.next_error		= npu_eeh_next_error,
 	.err_inject		= npu_err_inject,
 	.get_diag_data		= NULL,
 	.get_diag_data2		= NULL,
-- 
2.6.4



More information about the Skiboot mailing list