[Skiboot] [PATCH v2 4/7] hw/npu2: Setup an error interrupt on some opencapi FIRs

Frederic Barrat fbarrat at linux.ibm.com
Tue Mar 26 05:29:04 AEDT 2019


Many errors reported in the NPU FIR2 register, mostly catching
unexpected errors on the opencapi link, are defined as 'brick fatal' in
the workbook, yet the default action is set to system checkstop. It's
possible to see those errors during AFU development, where the AFU may
send unexpected packets on the link, therefore triggering those
errors. Checkstopping the system in this case is clearly extreme, as
the error could be contained to the brick and proper analysis of a
checkstop is not trivial outside of a bringup environment.

This patch changes the default action of those errors so that the NPU
will raise an interrupt instead. Follow-up patches will log
proper information so that the error can be debugged and Linux can
catch the event.

Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
---
Changelog
v2: no change

hw/npu2-common.c    | 27 +++++++++++++++++++++------
 hw/npu2-opencapi.c  | 39 ++++++++++++++++++++++++++++++++-------
 include/npu2-regs.h |  5 ++++-
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index 0b46f68c..ccbbbbca 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -108,8 +108,12 @@ static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn
 	struct npu2 *p = is->data;
 	uint32_t idx = isn - p->base_lsi;
 
-	if (idx == 18)
-		/* TCE Interrupt - used to detect a frozen PE */
+	if ((idx == 18) || (idx >= 27 && idx <= 34))
+		/*
+		 * level 18: TCE Interrupt - used to detect a frozen PE (nvlink)
+		 * level 27-30: OTL interrupt (opencapi)
+		 * level 31-34: XSL interrupt (opencapi)
+		 */
 		return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI;
 	else
 		return IRQ_ATTR_TARGET_LINUX;
@@ -166,14 +170,25 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
 {
 	struct npu2 *p = is->data;
 	uint32_t idx = isn - p->base_lsi;
+	int brick;
 
-	if (idx != 18) {
+	switch (idx) {
+	case 18:
+		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+					OPAL_EVENT_PCI_ERROR);
+		break;
+	case 27 ... 34:
+		/* opencapi only */
+		brick = 2 + ((idx - 27) % 4);
+		prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
+			p->chip_id, brick);
+		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+					OPAL_EVENT_PCI_ERROR);
+		break;
+	default:
 		prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
 		return;
 	}
-
-	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
-				OPAL_EVENT_PCI_ERROR);
 }
 
 static const struct irq_source_ops npu2_ipi_ops = {
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index d32aaa53..285615a5 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -1509,9 +1509,9 @@ static void mask_nvlink_fir(struct npu2 *p)
 	 */
 
 	/* Mask FIRs */
-	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, &reg);
+	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, &reg);
 	reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF);
-	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, reg);
+	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg);
 
 	/* freeze disable */
 	reg = npu2_scom_read(p->chip_id, p->xscom_base,
@@ -1535,17 +1535,42 @@ static void mask_nvlink_fir(struct npu2 *p)
 			NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
 }
 
-static int enable_xsl_irq(struct npu2 *p)
+static int enable_interrupts(struct npu2 *p)
 {
-	uint64_t reg;
+	uint64_t reg, val_xsl, val_override;
+
+	/*
+	 * Enable translation interrupts for all bricks and override
+	 * every brick-fatal error to send an interrupt instead of
+	 * checkstopping.
+	 *
+	 * FIR bits configured to trigger an interrupt must have their
+	 * default action masked
+	 */
+	val_xsl = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
+	val_override = 0x0FFFEFC00FF1B000;
+
+	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, &reg);
+	reg |= val_xsl | val_override;
+	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg);
 
-	/* enable translation interrupts for all bricks */
 	reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
 			     NPU2_MISC_DA_LEN_8B);
-	reg |= PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
+	reg |= val_xsl | val_override;
 	npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
 			NPU2_MISC_DA_LEN_8B, reg);
 
+	/*
+	 * Make sure the brick is fenced on those errors.
+	 * Fencing is incompatible with freezing, but there's no
+	 * freeze defined for FIR2, so we don't have to worry about it
+	 */
+	reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+			     NPU2_MISC_DA_LEN_8B);
+	reg |= val_override;
+	npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
+			NPU2_MISC_DA_LEN_8B, reg);
+
 	mask_nvlink_fir(p);
 	return 0;
 }
@@ -1704,7 +1729,7 @@ int npu2_opencapi_init_npu(struct npu2 *npu)
 		address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index);
 	}
 
-	enable_xsl_irq(npu);
+	enable_interrupts(npu);
 
 	for (int i = 0; i < npu->total_devices; i++) {
 		dev = &npu->devices[i];
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index ca311097..939a23f5 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -480,10 +480,13 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_MISC_IRQ_LOG13			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x368)
 #define NPU2_MISC_IRQ_LOG14			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x370)
 #define NPU2_MISC_IRQ_LOG15			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x378)
+#define NPU2_MISC_FENCE_ENABLE2			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x400)
 #define NPU2_MISC_IRQ_ENABLE2			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x408)
 
 /* Misc register, direct access only */
-#define NPU2_MISC_FIR_MASK1		0x2C43
+#define NPU2_MISC_FIR0_MASK		0x2C03
+#define NPU2_MISC_FIR1_MASK		0x2C43
+#define NPU2_MISC_FIR2_MASK		0x2C83
 
 /* ATS block registers */
 #define NPU2_ATS_PMU_CTL			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_ATS, 0x000)
-- 
2.19.1



More information about the Skiboot mailing list