[Skiboot] [PATCH 13/16] npu2-opencapi: Improve error reporting to the OS

Frederic Barrat fbarrat at linux.ibm.com
Mon Sep 9 22:31:48 AEST 2019


When an opencapi link is reset, the brick is temporarily fenced.
Therefore we can no longer rely on the fencing state of the brick to
check the health of an opencapi PHB: we could report a spurious error
if the PHB state is queried while a link is being reset.

Instead, we flag the device as 'broken' when an error interrupt is
received, just before raising an event to the OS. When the OS queries
the state of a PHB, we only have to check the 'broken' flag
(NPU2_DEV_BROKEN) of the device.

Note that there's no recovery possible on P9 when an error interrupt
is received unexpectedly, as recovery is not supported by hardware. So
when a device/link is marked as 'broken', it stays broken. All the OS
can do is log the error and notify the drivers.

Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
---
 hw/npu2-common.c   |  7 +++++++
 hw/npu2-opencapi.c | 21 +++++++++++++++++----
 include/npu2.h     |  5 +++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index 6d5c35af..51ecd0c8 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -406,6 +406,13 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
 			p->chip_id, irq_name);
 		free(irq_name);
 		show_all_regs(p, brick);
+		/*
+		 * P9 NPU doesn't support recovering a link going down
+		 * unexpectedly. So we mark the device as broken and
+		 * report it to the OS, so that the error is logged
+		 * and the drivers notified.
+		 */
+		npu2_opencapi_set_broken(p, brick);
 		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
 					OPAL_EVENT_PCI_ERROR);
 		break;
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index efec162d..c8bc64d1 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -1463,14 +1463,12 @@ static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
 				   uint16_t *severity)
 {
 	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
-	uint64_t reg;
 
 	if (!first_frozen_pe || !pci_error_type || !severity)
 		return OPAL_PARAMETER;
 
-	reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE);
-	if (reg & PPC_BIT(dev->brick_index)) {
-		OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index);
+	if (dev->flags & NPU2_DEV_BROKEN) {
+		OCAPIDBG(dev, "Reporting device as broken\n");
 		*first_frozen_pe = dev->linux_pe;
 		*pci_error_type = OPAL_EEH_PHB_ERROR;
 		*severity = OPAL_EEH_SEV_PHB_DEAD;
@@ -1820,6 +1818,21 @@ static const struct phb_ops npu2_opencapi_ops = {
 	.tce_kill		= NULL,
 };
 
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick)
+{
+	struct phb *phb;
+	struct npu2_dev *dev;
+
+	for_each_phb(phb) {
+		if (phb->phb_type == phb_type_npu_v2_opencapi) {
+			dev = phb_to_npu2_dev_ocapi(phb);
+			if (dev->npu == npu &&
+			    dev->brick_index == brick)
+				dev->flags |= NPU2_DEV_BROKEN;
+		}
+	}
+}
+
 static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
 				uint64_t addr, uint64_t PE_mask)
 {
diff --git a/include/npu2.h b/include/npu2.h
index 6171cd3c..d2a3430e 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -118,6 +118,8 @@ struct npu2_dev_nvlink {
 	const char		*slot_label;
 };
 
+#define NPU2_DEV_BROKEN		0x1
+
 struct npu2_dev {
 	enum npu2_dev_type	type;
 	uint32_t		link_index;
@@ -126,6 +128,7 @@ struct npu2_dev {
 	struct dt_node		*dt_node;
 	struct npu2_pcie_bar	bars[2];
 	struct npu2		*npu;
+	long			flags;
 
 	uint32_t		bdfn;
 
@@ -251,4 +254,6 @@ int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
 int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
 			       bool enable);
 
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick);
+
 #endif /* __NPU2_H */
-- 
2.21.0



More information about the Skiboot mailing list