[Skiboot] [PATCH v2 6/7] hw/npu2: Dump (more) npu2 registers on link error and HMIs

Thu Apr 4 16:52:35 AEDT 2019

On 26/3/19 5:29 am, Frederic Barrat wrote:
> We were already logging some NPU registers during an HMI. This patch
> cleans up a bit how it is done and separates what is global from what
> is specific to nvlink or opencapi.
> 
> Since we can now receive an error interrupt when an opencapi link goes
> down unexpectedly, we also dump the NPU state but we limit it to the
> registers of the brick which hit the error.
> 
> The list of registers to dump was worked out with the hw team to
> allow for proper debugging. For each register, we print the name as
> found in the NPU workbook, the scom address and the register value.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

> ---
> Changelog
> v2:
>    - Simplify per-stack and stack-independent register handling by
>      treating the XTS register separately
>    - use ARRAY_SIZE() to iterate over the registers to dump
> 
> 
>   core/hmi.c          |  58 +--------
>   hw/npu2-common.c    | 299 ++++++++++++++++++++++++++++++++++++++++++++
>   include/npu2-regs.h |  10 ++
>   include/npu2.h      |   1 +
>   4 files changed, 311 insertions(+), 57 deletions(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index fbb182c3..26277fa6 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>   	queue_hmi_event(hmi_evt, 0, out_flags);
>   }
>   
> -/*
> - * If the year is 2018 and you still see all these hardcoded, you
> - * should really replace this with the neat macros that's in the
> - * NPU2 code rather than this horrible listing of every single
> - * NPU2 register hardcoded for a specific chip.
> - *
> - * I feel dirty having even written it.
> - */
> -static uint32_t npu2_scom_dump[] = {
> -	0x5011017, 0x5011047, 0x5011077, 0x50110A7,
> -	0x5011217, 0x5011247, 0x5011277, 0x50112A7,
> -	0x5011417, 0x5011447, 0x5011477, 0x50114A7,
> -	0x50110DA, 0x50112DA, 0x50114DA,
> -	0x50110DB, 0x50112DB, 0x50114DB,
> -	0x5011011, 0x5011041, 0x5011071, 0x50110A1,
> -	0x5011211, 0x5011241, 0x5011271, 0x50112A1,
> -	0x5011411, 0x5011441, 0x5011471, 0x50114A1,
> -	0x5011018, 0x5011048, 0x5011078, 0x50110A8,
> -	0x5011218, 0x5011248, 0x5011278, 0x50112A8,
> -	0x5011418, 0x5011448, 0x5011478, 0x50114A8,
> -	0x5011640,
> -	0x5011114, 0x5011134, 0x5011314, 0x5011334,
> -	0x5011514, 0x5011534, 0x5011118, 0x5011138,
> -	0x5011318, 0x5011338, 0x5011518, 0x5011538,
> -	0x50110D8, 0x50112D8, 0x50114D8,
> -	0x50110D9, 0x50112D9, 0x50114D9,
> -	0x5011019, 0x5011049, 0x5011079, 0x50110A9,
> -	0x5011219, 0x5011249, 0x5011279, 0x50112A9,
> -	0x5011419, 0x5011449, 0x5011479, 0x50114A9,
> -	0x50110F4, 0x50112F4, 0x50114F4,
> -	0x50110F5, 0x50112F5, 0x50114F5,
> -	0x50110F6, 0x50112F6, 0x50114F6,
> -	0x50110FD, 0x50112FD, 0x50114FD,
> -	0x50110FE, 0x50112FE, 0x50114FE,
> -	0x00
> -};
> -
> -static void dump_scoms(int flat_chip_id, const char *unit, uint32_t *scoms,
> -			const char *loc)
> -{
> -	uint64_t value;
> -	int r;
> -
> -	while (*scoms != 0) {
> -		value = 0;
> -		r = _xscom_read(flat_chip_id, *scoms, &value, false);
> -		if (r != OPAL_SUCCESS)
> -			continue;
> -		prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
> -		      unit, loc, flat_chip_id, *scoms, value);
> -		scoms++;
> -	}
> -}
> -
>   static bool phb_is_npu2(struct dt_node *dn)
>   {
>   	return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
> @@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   	npu2_hmi_verbose = true;
>   
>   	if (npu2_hmi_verbose) {
> -		_xscom_lock();
> -		dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
> -		_xscom_unlock();
> +		npu2_dump_scoms(flat_chip_id);
>   		prlog(PR_ERR, " _________________________ \n");
>   		prlog(PR_ERR, "< It's Driver Debug time! >\n");
>   		prlog(PR_ERR, " ------------------------- \n");
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index ccbbbbca..def23728 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -103,6 +103,304 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
>   			(uint64_t)new_val << 32);
>   }
>   
> +typedef struct {
> +	const char *name;
> +	uint32_t block;
> +	uint32_t offset;
> +} npu2_scom_dump_t;
> +
> +static npu2_scom_dump_t npu2_scom_dump_global[] = {
> +#define __NPU2_SCOM_DUMP(name, block, offset) { name, block, offset }

What's the benefit of using this macro? And if we are going to use it, 
can we define it outside of the array definition?

> +	/* CQ State Machine */
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE0",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE0",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE0",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE0",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE1",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE1",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE1",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE1",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE2",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE2",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE2",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE2",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE3",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE3",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE3",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE3",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE4",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE4",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE4",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE4",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE5",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE5",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE5",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE5",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE6",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE6",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE6",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE6",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST0",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST0",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST0",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST0",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST1",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST1",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST1",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST1",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1),
> +
> +	__NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST2",
> +			NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2),
> +	__NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST2",
> +			NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2),
> +	__NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST2",
> +			NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2),
> +	__NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST2",
> +			NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2),
> +
> +	/* CQ Control */
> +	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE0",
> +			NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0),
> +	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE1",
> +			NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1),
> +	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST0",
> +			NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0),
> +	__NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST1",
> +			NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1),
> +
> +	/* CQ Data */
> +	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_HOLD",
> +			NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS),
> +	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_MASK",
> +			NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK),
> +	__NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_FIRST",
> +			NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST),
> +	__NPU2_SCOM_DUMP("DAT.MISC.REM0",
> +			NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0),
> +	__NPU2_SCOM_DUMP("DAT.MISC.REM1",
> +			NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1),
> +};
> +
> +static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
> +	__NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST1",
> +			NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF),
> +	__NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST1",
> +			NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF),
> +	__NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST2",
> +			NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF),
> +	__NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST2",
> +			NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF),
> +};
> +
> +static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
> +	__NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD0",
> +			NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0),
> +	__NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD0",
> +			NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0),
> +	__NPU2_SCOM_DUMP("OTL0.MISC.OTL_REM0",
> +			NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0),
> +	__NPU2_SCOM_DUMP("OTL1.MISC.OTL_REM0",
> +			NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0),
> +	__NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXI",
> +			NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG),
> +	__NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXI",
> +			NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG),
> +	__NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXO",
> +			NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG),
> +	__NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXO",
> +			NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG),
> +	__NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD1",
> +			NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1),
> +	__NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD1",
> +			NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1),
> +};
> +
> +static void print_one_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
> +{
> +	uint64_t reg, val;
> +
> +	reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
> +	val = npu2_scom_read(npu->chip_id, npu->xscom_base,
> +			reg, NPU2_MISC_DA_LEN_8B);
> +
> +	prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
> +		npu->chip_id, stack - 4, scom->name, reg, val);
> +}
> +
> +static void dump_npu2_regs_nvlink(struct npu2 *npu, int brick_index)
> +{
> +	uint32_t stack, ntl;
> +	int i;
> +
> +	stack = NPU2_STACK_STCK_0 + brick_index / 2;
> +	ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
> +
> +	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
> +		if (npu2_scom_dump_nvlink[i].block == ntl)
> +			print_one_reg(npu, &npu2_scom_dump_nvlink[i], stack);
> +	}
> +}
> +
> +static void dump_npu2_regs_opencapi(struct npu2 *npu, int brick_index)
> +{
> +	uint64_t val, addr;
> +	uint32_t stack, otl;
> +	int i;
> +
> +	stack = NPU2_STACK_STCK_0 + brick_index / 2;
> +	otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
> +
> +	/* NPU registers */
> +	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
> +		if (npu2_scom_dump_ocapi[i].block == otl)
> +			print_one_reg(npu, &npu2_scom_dump_ocapi[i], stack);
> +	}
> +
> +	/* Fabric registers */
> +	addr = OB_ODL_STATUS(brick_index);
> +	xscom_read(npu->chip_id, addr, &val);
> +	prlog(PR_ERR, "NPU[%d] ODL status brick %d 0x%llx = 0x%016llx\n",
> +		npu->chip_id, brick_index, addr, val);
> +
> +	addr = OB_ODL_TRAINING_STATUS(brick_index);
> +	xscom_read(npu->chip_id, addr, &val);
> +	prlog(PR_ERR, "NPU[%d] ODL training status brick %d 0x%llx = 0x%016llx\n",
> +		npu->chip_id, brick_index, addr, val);
> +
> +	addr = OB_ODL_ENDPOINT_INFO(brick_index);
> +	xscom_read(npu->chip_id, addr, &val);
> +	prlog(PR_ERR, "NPU[%d] ODL endpoint info brick %d 0x%llx = 0x%016llx\n",
> +		npu->chip_id, brick_index, addr, val);
> +}
> +
> +static void dump_npu2_regs(struct npu2 *npu, int brick_index)
> +{
> +	int i, stack, stack_min, stack_max;
> +	uint64_t fir_val, mask_val, fir_addr, mask_addr;
> +	uint64_t reg, val;
> +	struct npu2_dev *dev;
> +
> +	if (brick_index != -1) {
> +		stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
> +	} else {
> +		stack_min = NPU2_STACK_STCK_0;
> +		stack_max = NPU2_STACK_STCK_2;
> +		/* Avoid dumping unused stacks for opencapi on Lagrange */
> +		if (npu->total_devices == 2)
> +			stack_min = stack_max = NPU2_STACK_STCK_1;
> +	}
> +
> +	/* NPU FIRs */
> +	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
> +		fir_addr  = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
> +		mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
> +		xscom_read(npu->chip_id, fir_addr, &fir_val);
> +		xscom_read(npu->chip_id, mask_addr, &mask_val);
> +		prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
> +			npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
> +	}
> +
> +	/* NPU global, per-stack registers */
> +	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
> +		for (stack = stack_min; stack <= stack_max; stack++)
> +			print_one_reg(npu, &npu2_scom_dump_global[i], stack);
> +	}
> +
> +	/*
> +	 * NPU global registers, stack independent
> +	 *
> +	 * We have only one for now, so dump it directly
> +	 */
> +	reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0);
> +	val = npu2_scom_read(npu->chip_id, npu->xscom_base,
> +			reg, NPU2_MISC_DA_LEN_8B);
> +
> +	prlog(PR_ERR, "NPU[%d] XTS.REG.ERR_HOLD 0x%llx = 0x%016llx\n",
> +		npu->chip_id, reg, val);
> +
> +	/* nvlink- or opencapi-specific registers */
> +	for (i = 0; i < npu->total_devices; i++) {
> +		dev = &npu->devices[i];
> +		if (brick_index == -1 || dev->brick_index == brick_index) {
> +			if (dev->type == NPU2_DEV_TYPE_NVLINK)
> +				dump_npu2_regs_nvlink(npu, dev->brick_index);
> +			else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
> +				dump_npu2_regs_opencapi(npu, dev->brick_index);
> +		}
> +	}
> +}
> +
> +void npu2_dump_scoms(int chip_id)

Can we rename this to something that's less likely to get mentally confused 
with "dump_npu2_regs"? I don't have suggestions.

> +{
> +	struct npu2 *npu;
> +	struct phb *phb;
> +	struct npu2_dev *dev;
> +
> +	/*
> +	 * Look for the npu2 structure for that chip ID. We can access it
> +	 * through the array of phbs, looking for a nvlink or opencapi
> +	 * phb. We can have several entries, but they all point
> +	 * to the same npu2 structure
> +	 */
> +	for_each_phb(phb) {
> +		npu = NULL;
> +		if (phb->phb_type == phb_type_npu_v2) {
> +			npu = phb_to_npu2_nvlink(phb);
> +		} else if (phb->phb_type == phb_type_npu_v2_opencapi) {
> +			dev = phb_to_npu2_dev_ocapi(phb);
> +			npu = dev->npu;
> +		}
> +		if (npu && npu->chip_id == chip_id) {
> +			dump_npu2_regs(npu, -1 /* all bricks */);
> +			break;
> +		}
> +	}
> +}
> +
>   static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
>   {
>   	struct npu2 *p = is->data;
> @@ -182,6 +480,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
>   		brick = 2 + ((idx - 27) % 4);
>   		prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
>   			p->chip_id, brick);
> +		dump_npu2_regs(p, brick);
>   		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
>   					OPAL_EVENT_PCI_ERROR);
>   		break;
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 939a23f5..ba10b8ea 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>   #define NPU2_PERF_MASK				0x110
>   #define NPU2_DBG0_CFG				0x118
>   #define NPU2_DBG1_CFG				0x120
> +#define NPU2_C_ERR_RPT_MSG5			0x128
> +#define NPU2_C_ERR_RPT_MSG6			0x130
>   
>   /* CTL block registers */
>   #define NPU2_CQ_CTL_MISC_CFG			0x000
> @@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>   #define NPU2_NTL_MISC_CFG3(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x008)
>   #define NPU2_NTL_ERR_HOLD1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x010)
>   #define NPU2_NTL_ERR_MASK1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x018)
> +#define NPU2_NTL_ERR_FIRST1_OFF			0x020
>   #define NPU2_NTL_ERR_FIRST1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x020)
>   #define NPU2_NTL_ERR_FIRST1_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x028)
>   #define NPU2_NTL_ERR_HOLD2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x030)
>   #define NPU2_NTL_ERR_MASK2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x038)
> +#define NPU2_NTL_ERR_FIRST2_OFF			0x040
>   #define NPU2_NTL_ERR_FIRST2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x040)
>   #define NPU2_NTL_ERR_FIRST2_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x048)
>   #define NPU2_NTL_SCRATCH2(ndev)			NPU2_NTL_REG_OFFSET(ndev, 0x050)
> @@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>   #define NPU2_OTL_OSL_DAR(stack, block)		NPU2_REG_OFFSET(stack, block, 0x008)
>   #define NPU2_OTL_OSL_TFC(stack, block)		NPU2_REG_OFFSET(stack, block, 0x010)
>   #define NPU2_OTL_OSL_PEHANDLE(stack, block)	NPU2_REG_OFFSET(stack, block, 0x018)
> +#define NPU2_OTL_ERR_RPT_HOLD0			0x30
> +#define NPU2_OTL_RAS_ERR_MSG0			0x68
> +#define NPU2_OTL_RXI_ERR_SIG			0x70
> +#define NPU2_OTL_RXO_ERR_SIG			0x78
> +#define NPU2_OTL_ERR_RPT_HOLD1			0xB0
> +
>   
>   /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
>    * there is only a single instance of each of these in the NPU so we
> diff --git a/include/npu2.h b/include/npu2.h
> index ef4e7aff..d58aab47 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
>   			   uint8_t *freeze_state,
>   			   uint16_t *pci_error_type __unused,
>   			   uint16_t *severity __unused);
> +void npu2_dump_scoms(int chip_id);
>   #endif /* __NPU2_H */
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com  IBM Australia Limited