[Skiboot] [PATCH 6/7] hw/npu2: Dump (more) npu2 registers on link error and HMIs

Frederic Barrat fbarrat at linux.ibm.com
Thu Mar 21 05:35:21 AEDT 2019


We were already logging some NPU registers during an HMI. This patch
cleans up a bit how it is done and separates what is global from what
is specific to nvlink or opencapi.

Since we can now receive an error interrupt when an opencapi link goes
down unexpectedly, we also dump the NPU state but we limit it to the
registers of the brick which hit the error.

The list of registers to dump was worked out with the hw team to
allow for proper debugging. For each register, we print the name as
found in the NPU workbook, the scom address and the register value.

Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
---
 core/hmi.c          |  58 +-------
 hw/npu2-common.c    | 312 ++++++++++++++++++++++++++++++++++++++++++++
 include/npu2-regs.h |  10 ++
 include/npu2.h      |   1 +
 4 files changed, 324 insertions(+), 57 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index fbb182c3..26277fa6 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
 	queue_hmi_event(hmi_evt, 0, out_flags);
 }
 
-/*
- * If the year is 2018 and you still see all these hardcoded, you
- * should really replace this with the neat macros that's in the
- * NPU2 code rather than this horrible listing of every single
- * NPU2 register hardcoded for a specific chip.
- *
- * I feel dirty having even written it.
- */
-static uint32_t npu2_scom_dump[] = {
-	0x5011017, 0x5011047, 0x5011077, 0x50110A7,
-	0x5011217, 0x5011247, 0x5011277, 0x50112A7,
-	0x5011417, 0x5011447, 0x5011477, 0x50114A7,
-	0x50110DA, 0x50112DA, 0x50114DA,
-	0x50110DB, 0x50112DB, 0x50114DB,
-	0x5011011, 0x5011041, 0x5011071, 0x50110A1,
-	0x5011211, 0x5011241, 0x5011271, 0x50112A1,
-	0x5011411, 0x5011441, 0x5011471, 0x50114A1,
-	0x5011018, 0x5011048, 0x5011078, 0x50110A8,
-	0x5011218, 0x5011248, 0x5011278, 0x50112A8,
-	0x5011418, 0x5011448, 0x5011478, 0x50114A8,
-	0x5011640,
-	0x5011114, 0x5011134, 0x5011314, 0x5011334,
-	0x5011514, 0x5011534, 0x5011118, 0x5011138,
-	0x5011318, 0x5011338, 0x5011518, 0x5011538,
-	0x50110D8, 0x50112D8, 0x50114D8,
-	0x50110D9, 0x50112D9, 0x50114D9,
-	0x5011019, 0x5011049, 0x5011079, 0x50110A9,
-	0x5011219, 0x5011249, 0x5011279, 0x50112A9,
-	0x5011419, 0x5011449, 0x5011479, 0x50114A9,
-	0x50110F4, 0x50112F4, 0x50114F4,
-	0x50110F5, 0x50112F5, 0x50114F5,
-	0x50110F6, 0x50112F6, 0x50114F6,
-	0x50110FD, 0x50112FD, 0x50114FD,
-	0x50110FE, 0x50112FE, 0x50114FE,
-	0x00
-};
-
-static void dump_scoms(int flat_chip_id, const char *unit, uint32_t *scoms,
-			const char *loc)
-{
-	uint64_t value;
-	int r;
-
-	while (*scoms != 0) {
-		value = 0;
-		r = _xscom_read(flat_chip_id, *scoms, &value, false);
-		if (r != OPAL_SUCCESS)
-			continue;
-		prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
-		      unit, loc, flat_chip_id, *scoms, value);
-		scoms++;
-	}
-}
-
 static bool phb_is_npu2(struct dt_node *dn)
 {
 	return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
@@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	npu2_hmi_verbose = true;
 
 	if (npu2_hmi_verbose) {
-		_xscom_lock();
-		dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
-		_xscom_unlock();
+		npu2_dump_scoms(flat_chip_id);
 		prlog(PR_ERR, " _________________________ \n");
 		prlog(PR_ERR, "< It's Driver Debug time! >\n");
 		prlog(PR_ERR, " ------------------------- \n");
diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index ccbbbbca..e370ba87 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -103,6 +103,317 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
 			(uint64_t)new_val << 32);
 }
 
+typedef struct {
+	const char *name;
+	uint32_t stack;
+	uint32_t block;
+	uint32_t offset;
+} npu2_scom_dump_t;
+
+static npu2_scom_dump_t npu2_scom_dump_global[] = {
+#define __NPU2_SCOM_DUMP(name, stack, block, offset) { name, stack, block, offset }
+	/* CQ State Machine */
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE3",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE3",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE3",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE3",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE4",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE4",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE4",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE4",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE5",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE5",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE5",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE5",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE6",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE6",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE6",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE6",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1),
+
+	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2),
+	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2),
+	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2),
+	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2),
+
+	/* CQ Control */
+	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0),
+	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1),
+	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0),
+	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1),
+
+	/* CQ Data */
+	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_HOLD",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS),
+	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_MASK",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK),
+	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_FIRST",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST),
+	__NPU2_SCOM_DUMP("DAT.MISC.REM0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0),
+	__NPU2_SCOM_DUMP("DAT.MISC.REM1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1),
+
+	/* XTS */
+	__NPU2_SCOM_DUMP("XTS.REG.ERR_HOLD",
+			NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0),
+
+	{ NULL, 0, 0, 0 }
+};
+
+static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+	__NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF),
+	__NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF),
+	__NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF),
+	__NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST2",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF),
+	{ NULL, 0, 0, 0 }
+};
+
+static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+	__NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0),
+	__NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0),
+	__NPU2_SCOM_DUMP("OTL0.MISC.OTL_REM0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0),
+	__NPU2_SCOM_DUMP("OTL1.MISC.OTL_REM0",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0),
+	__NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXI",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG),
+	__NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXI",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG),
+	__NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXO",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG),
+	__NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXO",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG),
+	__NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1),
+	__NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD1",
+			NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1),
+	{ NULL, 0, 0, 0 }
+};
+
+static void print_one_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
+{
+	uint64_t reg, val;
+
+	if (scom->stack != NPU2_STACK_STCK_0)
+		stack = scom->stack;
+
+	reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+	val = npu2_scom_read(npu->chip_id, npu->xscom_base,
+			reg, NPU2_MISC_DA_LEN_8B);
+
+	if (scom->stack != NPU2_STACK_STCK_0)
+		prlog(PR_ERR, "NPU[%d] %s 0x%llx = 0x%016llx\n",
+			npu->chip_id, scom->name, reg, val);
+	else
+		prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+			npu->chip_id, stack - 4, scom->name, reg, val);
+}
+
+static void dump_npu2_regs_nvlink(struct npu2 *npu, int brick_index)
+{
+	uint32_t stack, ntl;
+	npu2_scom_dump_t *scom_reg;
+
+	stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
+
+	scom_reg = npu2_scom_dump_nvlink;
+	while (scom_reg->name) {
+		if (scom_reg->block == ntl)
+			print_one_reg(npu, scom_reg, stack);
+		scom_reg++;
+	}
+}
+
+static void dump_npu2_regs_opencapi(struct npu2 *npu, int brick_index)
+{
+	uint64_t val, addr;
+	uint32_t stack, otl;
+	npu2_scom_dump_t *scom_reg;
+
+	stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
+
+	/* NPU registers */
+	scom_reg = npu2_scom_dump_ocapi;
+	while (scom_reg->name) {
+		if (scom_reg->block == otl)
+			print_one_reg(npu, scom_reg, stack);
+		scom_reg++;
+	}
+
+	/* Fabric registers */
+	addr = OB_ODL_STATUS(brick_index);
+	xscom_read(npu->chip_id, addr, &val);
+	prlog(PR_ERR, "NPU[%d] ODL status brick %d 0x%llx = 0x%016llx\n",
+		npu->chip_id, brick_index, addr, val);
+
+	addr = OB_ODL_TRAINING_STATUS(brick_index);
+	xscom_read(npu->chip_id, addr, &val);
+	prlog(PR_ERR, "NPU[%d] ODL training status brick %d 0x%llx = 0x%016llx\n",
+		npu->chip_id, brick_index, addr, val);
+
+	addr = OB_ODL_ENDPOINT_INFO(brick_index);
+	xscom_read(npu->chip_id, addr, &val);
+	prlog(PR_ERR, "NPU[%d] ODL endpoint info brick %d 0x%llx = 0x%016llx\n",
+		npu->chip_id, brick_index, addr, val);
+}
+
+static void dump_npu2_regs(struct npu2 *npu, int brick_index)
+{
+	int i, stack_min, stack_max;
+	uint64_t fir_val, mask_val, fir_addr, mask_addr;
+	struct npu2_dev *dev;
+	npu2_scom_dump_t *scom_reg;
+
+	if (brick_index != -1) {
+		stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+	} else {
+		stack_min = NPU2_STACK_STCK_0;
+		stack_max = NPU2_STACK_STCK_2;
+		/* Avoid dumping unused stacks for opencapi on Lagrange */
+		if (npu->total_devices == 2)
+			stack_min = stack_max = NPU2_STACK_STCK_1;
+	}
+
+	/* NPU FIRs */
+	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+		fir_addr  = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+		mask_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET + NPU2_FIR_MASK_OFFSET;
+		xscom_read(npu->chip_id, fir_addr, &fir_val);
+		xscom_read(npu->chip_id, mask_addr, &mask_val);
+		prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+			npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+	}
+
+	/* NPU global registers */
+	scom_reg = npu2_scom_dump_global;
+	while (scom_reg->name) {
+		if (scom_reg->stack != NPU2_STACK_STCK_0) // not a per-stack reg
+			print_one_reg(npu, scom_reg, -1);
+		else
+			for (i = stack_min; i <= stack_max; i++)
+				print_one_reg(npu, scom_reg, i);
+		scom_reg++;
+	}
+
+	/* nvlink- or opencapi-specific registers */
+	for (i = 0; i < npu->total_devices; i++) {
+		dev = &npu->devices[i];
+		if (brick_index == -1 || dev->brick_index == brick_index) {
+			if (dev->type == NPU2_DEV_TYPE_NVLINK)
+				dump_npu2_regs_nvlink(npu, dev->brick_index);
+			else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+				dump_npu2_regs_opencapi(npu, dev->brick_index);
+		}
+	}
+}
+
+void npu2_dump_scoms(int chip_id)
+{
+	struct npu2 *npu;
+	struct phb *phb;
+	struct npu2_dev *dev;
+
+	/*
+	 * Look for the npu2 structure for that chip ID. We can access it
+	 * through the array of phbs, looking for a nvlink or opencapi
+	 * phb. We can have several entries, but they all point
+	 * to the same npu2 structure
+	 */
+	for_each_phb(phb) {
+		npu = NULL;
+		if (phb->phb_type == phb_type_npu_v2) {
+			npu = phb_to_npu2_nvlink(phb);
+		} else if (phb->phb_type == phb_type_npu_v2_opencapi) {
+			dev = phb_to_npu2_dev_ocapi(phb);
+			npu = dev->npu;
+		}
+		if (npu && npu->chip_id == chip_id) {
+			dump_npu2_regs(npu, -1 /* all bricks */);
+			break;
+		}
+	}
+}
+
 static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
 {
 	struct npu2 *p = is->data;
@@ -182,6 +493,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
 		brick = 2 + ((idx - 27) % 4);
 		prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
 			p->chip_id, brick);
+		dump_npu2_regs(p, brick);
 		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
 					OPAL_EVENT_PCI_ERROR);
 		break;
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 939a23f5..ba10b8ea 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_PERF_MASK				0x110
 #define NPU2_DBG0_CFG				0x118
 #define NPU2_DBG1_CFG				0x120
+#define NPU2_C_ERR_RPT_MSG5			0x128
+#define NPU2_C_ERR_RPT_MSG6			0x130
 
 /* CTL block registers */
 #define NPU2_CQ_CTL_MISC_CFG			0x000
@@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_NTL_MISC_CFG3(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x008)
 #define NPU2_NTL_ERR_HOLD1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x010)
 #define NPU2_NTL_ERR_MASK1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x018)
+#define NPU2_NTL_ERR_FIRST1_OFF			0x020
 #define NPU2_NTL_ERR_FIRST1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x020)
 #define NPU2_NTL_ERR_FIRST1_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x028)
 #define NPU2_NTL_ERR_HOLD2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x030)
 #define NPU2_NTL_ERR_MASK2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x038)
+#define NPU2_NTL_ERR_FIRST2_OFF			0x040
 #define NPU2_NTL_ERR_FIRST2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x040)
 #define NPU2_NTL_ERR_FIRST2_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x048)
 #define NPU2_NTL_SCRATCH2(ndev)			NPU2_NTL_REG_OFFSET(ndev, 0x050)
@@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_OTL_OSL_DAR(stack, block)		NPU2_REG_OFFSET(stack, block, 0x008)
 #define NPU2_OTL_OSL_TFC(stack, block)		NPU2_REG_OFFSET(stack, block, 0x010)
 #define NPU2_OTL_OSL_PEHANDLE(stack, block)	NPU2_REG_OFFSET(stack, block, 0x018)
+#define NPU2_OTL_ERR_RPT_HOLD0			0x30
+#define NPU2_OTL_RAS_ERR_MSG0			0x68
+#define NPU2_OTL_RXI_ERR_SIG			0x70
+#define NPU2_OTL_RXO_ERR_SIG			0x78
+#define NPU2_OTL_ERR_RPT_HOLD1			0xB0
+
 
 /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
  * there is only a single instance of each of these in the NPU so we
diff --git a/include/npu2.h b/include/npu2.h
index ef4e7aff..d58aab47 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
 			   uint8_t *freeze_state,
 			   uint16_t *pci_error_type __unused,
 			   uint16_t *severity __unused);
+void npu2_dump_scoms(int chip_id);
 #endif /* __NPU2_H */
-- 
2.19.1



More information about the Skiboot mailing list