[Skiboot] Fwd: [PATCH 6/7] hw/npu2: Dump (more) npu2 registers on link error and HMIs

Fri Mar 22 03:56:42 AEDT 2019

Hi Alexey,

[ replying to the list ]

Thanks for the review! Many good points, a few more details below.

Le 21/03/2019 à 05:21, Alexey Kardashevskiy a écrit :
> Hi,
> 
> I think I saw a similar one from Andrew as well so I just relaxed and I
> am waiting till Stewart takes any :)  Comments below.

:-)
Though I think you're referring to a different patch from Andrew, who is 
cleaning up a bunch of scom accesses in npu2_nvlink_init_npu()

> On 21/03/2019 07:25, Frederic Barrat wrote:
>> Hi Alexey,
>>
>> I forgot to CC you on that patch... It's related to something you had
>> sent before, where you were cleaning up how we dump npu2 registers on
>> HMI. Comments welcome :-)
>>
>>    Fred
>>
>>
>> -------- Message transféré --------
>> Sujet : [Skiboot] [PATCH 6/7] hw/npu2: Dump (more) npu2 registers on
>> link error and HMIs
>> Date : Wed, 20 Mar 2019 19:35:21 +0100
>> De : Frederic Barrat <fbarrat at linux.ibm.com>
>> Pour : skiboot at lists.ozlabs.org, andrew.donnellan at au1.ibm.com
>> Copie à : clombard at linux.ibm.com, arbab at linux.ibm.com
>>
>> We were already logging some NPU registers during an HMI. This patch
>> cleans up a bit how it is done and separates what is global from what
>> is specific to nvlink or opencapi.
>>
>> Since we can now receive an error interrupt when an opencapi link goes
>> down unexpectedly, we also dump the NPU state but we limit it to the
>> registers of the brick which hit the error.
>>
>> The list of registers to dump was worked out with the hw team to
>> allow for proper debugging. For each register, we print the name as
>> found in the NPU workbook, the scom address and the register value.
>>
>> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
>> ---
>>   core/hmi.c          |  58 +-------
>>   hw/npu2-common.c    | 312 ++++++++++++++++++++++++++++++++++++++++++++
>>   include/npu2-regs.h |  10 ++
>>   include/npu2.h      |   1 +
>>   4 files changed, 324 insertions(+), 57 deletions(-)
>>
>> diff --git a/core/hmi.c b/core/hmi.c
>> index fbb182c3..26277fa6 100644
>> --- a/core/hmi.c
>> +++ b/core/hmi.c
>> @@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>>       queue_hmi_event(hmi_evt, 0, out_flags);
>>   }
>>   -/*
>> - * If the year is 2018 and you still see all these hardcoded, you
>> - * should really replace this with the neat macros that's in the
>> - * NPU2 code rather than this horrible listing of every single
>> - * NPU2 register hardcoded for a specific chip.
>> - *
>> - * I feel dirty having even written it.
>> - */
>> -static uint32_t npu2_scom_dump[] = {
>> -    0x5011017, 0x5011047, 0x5011077, 0x50110A7,
>> -    0x5011217, 0x5011247, 0x5011277, 0x50112A7,
>> -    0x5011417, 0x5011447, 0x5011477, 0x50114A7,
>> -    0x50110DA, 0x50112DA, 0x50114DA,
>> -    0x50110DB, 0x50112DB, 0x50114DB,
>> -    0x5011011, 0x5011041, 0x5011071, 0x50110A1,
>> -    0x5011211, 0x5011241, 0x5011271, 0x50112A1,
>> -    0x5011411, 0x5011441, 0x5011471, 0x50114A1,
>> -    0x5011018, 0x5011048, 0x5011078, 0x50110A8,
>> -    0x5011218, 0x5011248, 0x5011278, 0x50112A8,
>> -    0x5011418, 0x5011448, 0x5011478, 0x50114A8,
>> -    0x5011640,
>> -    0x5011114, 0x5011134, 0x5011314, 0x5011334,
>> -    0x5011514, 0x5011534, 0x5011118, 0x5011138,
>> -    0x5011318, 0x5011338, 0x5011518, 0x5011538,
>> -    0x50110D8, 0x50112D8, 0x50114D8,
>> -    0x50110D9, 0x50112D9, 0x50114D9,
>> -    0x5011019, 0x5011049, 0x5011079, 0x50110A9,
>> -    0x5011219, 0x5011249, 0x5011279, 0x50112A9,
>> -    0x5011419, 0x5011449, 0x5011479, 0x50114A9,
>> -    0x50110F4, 0x50112F4, 0x50114F4,
>> -    0x50110F5, 0x50112F5, 0x50114F5,
>> -    0x50110F6, 0x50112F6, 0x50114F6,
>> -    0x50110FD, 0x50112FD, 0x50114FD,
>> -    0x50110FE, 0x50112FE, 0x50114FE,
>> -    0x00
>> -};
> 
> 
> I cannot confirm if all of these are replaced but since it was not
> extremely usable in the first place - anything is better :)

I went through the list and I'm pretty sure they are all there. 
Actually, there are a few more that the NPU team asked to add.

> 
>> -
>> -static void dump_scoms(int flat_chip_id, const char *unit, uint32_t
>> *scoms,
>> -            const char *loc)
>> -{
>> -    uint64_t value;
>> -    int r;
>> -
>> -    while (*scoms != 0) {
>> -        value = 0;
>> -        r = _xscom_read(flat_chip_id, *scoms, &value, false);
>> -        if (r != OPAL_SUCCESS)
>> -            continue;
>> -        prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
>> -              unit, loc, flat_chip_id, *scoms, value);
>> -        scoms++;
>> -    }
>> -}
>> -
>>   static bool phb_is_npu2(struct dt_node *dn)
>>   {
>>       return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
>> @@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int
>> flat_chip_id,
>>       npu2_hmi_verbose = true;
>>        if (npu2_hmi_verbose) {
>> -        _xscom_lock();
>> -        dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
>> -        _xscom_unlock();
>> +        npu2_dump_scoms(flat_chip_id);
>>           prlog(PR_ERR, " _________________________ \n");
>>           prlog(PR_ERR, "< It's Driver Debug time! >\n");
>>           prlog(PR_ERR, " ------------------------- \n");
>> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
>> index ccbbbbca..e370ba87 100644
>> --- a/hw/npu2-common.c
>> +++ b/hw/npu2-common.c
>> @@ -103,6 +103,317 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t
>> reg, uint32_t val, uint32_t mas
>>               (uint64_t)new_val << 32);
>>   }
>>   +typedef struct {
>> +    const char *name;
>> +    uint32_t stack;
> 
> This one is NPU2_STACK_STCK_0 almost always, with just a single
> exception. You could just dump "XTS.REG.ERR_HOLD" separately and
> simplify the arrays. You could even s/__NPU2_SCOM_DUMP/__DUMP/g and make
> even item of the array be a single line => easier to read.

Indeed, there's only 1 register which is not per-stack. I took it out of 
the array and print it separately, as it also simplify things, as you 
noted below.

>> +    uint32_t block;
> 
> 
> not "ntl"? Although I am still struggling with all these
> bricks/stacks/links/ntl/otl/blocks/whatever so never mind...

I did mean "block" :-) The block value could be ntl, or otl, or others.

>> +    uint32_t offset;
>> +} npu2_scom_dump_t;
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_global[] = {
>> +#define __NPU2_SCOM_DUMP(name, stack, block, offset) { name, stack,
>> block, offset }
>> +    /* CQ State Machine */
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE3",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE3",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE3",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE3",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE4",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE4",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE4",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE4",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE5",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE5",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE5",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE5",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE6",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE6",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE6",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE6",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1),
>> +
>> +    __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2),
>> +    __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2),
>> +    __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2),
>> +    __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2),
>> +
>> +    /* CQ Control */
>> +    __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0),
>> +    __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1),
>> +    __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0),
>> +    __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1),
>> +
>> +    /* CQ Data */
>> +    __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_HOLD",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS),
>> +    __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_MASK",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK),
>> +    __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_FIRST",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST),
>> +    __NPU2_SCOM_DUMP("DAT.MISC.REM0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0),
>> +    __NPU2_SCOM_DUMP("DAT.MISC.REM1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1),
>> +
>> +    /* XTS */
>> +    __NPU2_SCOM_DUMP("XTS.REG.ERR_HOLD",
>> +            NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0),
>> +
>> +    { NULL, 0, 0, 0 }
>> +};
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
>> +    __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF),
>> +    __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF),
>> +    __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF),
>> +    __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST2",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF),
>> +    { NULL, 0, 0, 0 }
>> +};
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
>> +    __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0),
>> +    __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0),
>> +    __NPU2_SCOM_DUMP("OTL0.MISC.OTL_REM0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0),
>> +    __NPU2_SCOM_DUMP("OTL1.MISC.OTL_REM0",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0),
>> +    __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXI",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG),
>> +    __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXI",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG),
>> +    __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXO",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG),
>> +    __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXO",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG),
>> +    __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1),
>> +    __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD1",
>> +            NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1),
>> +    { NULL, 0, 0, 0 }
>> +};
>> +
>> +static void print_one_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int
>> stack)
>> +{
>> +    uint64_t reg, val;
>> +
>> +    if (scom->stack != NPU2_STACK_STCK_0)
>> +        stack = scom->stack;
>> +
>> +    reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
>> +    val = npu2_scom_read(npu->chip_id, npu->xscom_base,
>> +            reg, NPU2_MISC_DA_LEN_8B);
>> +
>> +    if (scom->stack != NPU2_STACK_STCK_0)
>> +        prlog(PR_ERR, "NPU[%d] %s 0x%llx = 0x%016llx\n",
>> +            npu->chip_id, scom->name, reg, val);
>> +    else
>> +        prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
>> +            npu->chip_id, stack - 4, scom->name, reg, val);
> 
> 
> The scom->stack vs stack logic is unclear here. When would not they be
> equal? Why would you treat NPU2_STACK_STCK_0 any different? Or it is all
> about separate handling of "XTS.REG.ERR_HOLD"?
> 
> 

It was for registers which are not per-stack, but as you noted, I ended 
up with only one XTS register. So yes, it is overkill. I'll simplify 
things by taking the XTS register outside of the array, so that function 
becomes readable.

>> +}
>> +
>> +static void dump_npu2_regs_nvlink(struct npu2 *npu, int brick_index)
>> +{
>> +    uint32_t stack, ntl;
>> +    npu2_scom_dump_t *scom_reg;
>> +
>> +    stack = NPU2_STACK_STCK_0 + brick_index / 2;
>> +    ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
>> +
>> +    scom_reg = npu2_scom_dump_nvlink;
> 
> 
> I find it weird to iterate till the null item here as all these arrays
> are static, the size is known, you never pass these arrays to any helper
> (you pass only a single item). ARRAY_SIZE() would look cleaner imo.

Good point. I'll switch to ARRAY_SIZE()

> 
> 
>> +    while (scom_reg->name) {
>> +        if (scom_reg->block == ntl)
>> +            print_one_reg(npu, scom_reg, stack);
>> +        scom_reg++;
>> +    }
>> +}
>> +
>> +static void dump_npu2_regs_opencapi(struct npu2 *npu, int brick_index)
>> +{
>> +    uint64_t val, addr;
>> +    uint32_t stack, otl;
>> +    npu2_scom_dump_t *scom_reg;
>> +
>> +    stack = NPU2_STACK_STCK_0 + brick_index / 2;
>> +    otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
>> +
>> +    /* NPU registers */
>> +    scom_reg = npu2_scom_dump_ocapi;
>> +    while (scom_reg->name) {
>> +        if (scom_reg->block == otl)
>> +            print_one_reg(npu, scom_reg, stack);
>> +        scom_reg++;
>> +    }
>> +
>> +    /* Fabric registers */
>> +    addr = OB_ODL_STATUS(brick_index);
>> +    xscom_read(npu->chip_id, addr, &val);
>> +    prlog(PR_ERR, "NPU[%d] ODL status brick %d 0x%llx = 0x%016llx\n",
>> +        npu->chip_id, brick_index, addr, val);
> 
> 
> Nit: this could make a nice helper like:
> 
> xscom_read_err(npu->chip_id, brick_index, OB_ODL_STATUS(brick_index), msg)
> 

OK, I'll see if I can even push to have one single function printing, to 
guarantee to have a common format, see if it's worth it.

>> +
>> +    addr = OB_ODL_TRAINING_STATUS(brick_index);
>> +    xscom_read(npu->chip_id, addr, &val);
>> +    prlog(PR_ERR, "NPU[%d] ODL training status brick %d 0x%llx =
>> 0x%016llx\n",
>> +        npu->chip_id, brick_index, addr, val);
>> +
>> +    addr = OB_ODL_ENDPOINT_INFO(brick_index);
>> +    xscom_read(npu->chip_id, addr, &val);
>> +    prlog(PR_ERR, "NPU[%d] ODL endpoint info brick %d 0x%llx =
>> 0x%016llx\n",
>> +        npu->chip_id, brick_index, addr, val);
>> +}
>> +
>> +static void dump_npu2_regs(struct npu2 *npu, int brick_index)
>> +{
>> +    int i, stack_min, stack_max;
>> +    uint64_t fir_val, mask_val, fir_addr, mask_addr;
>> +    struct npu2_dev *dev;
>> +    npu2_scom_dump_t *scom_reg;
>> +
>> +    if (brick_index != -1) {
>> +        stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
>> +    } else {
>> +        stack_min = NPU2_STACK_STCK_0;
>> +        stack_max = NPU2_STACK_STCK_2;
>> +        /* Avoid dumping unused stacks for opencapi on Lagrange */
>> +        if (npu->total_devices == 2)
>> +            stack_min = stack_max = NPU2_STACK_STCK_1;
>> +    }
>> +
>> +    /* NPU FIRs */
>> +    for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
>> +        fir_addr  = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
>> +        mask_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET +
>> NPU2_FIR_MASK_OFFSET;
> 
> Tiny nit:
>        mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;

OK.

Thanks a lot, I'll be sending a v2 shortly.

   Fred

>> +        xscom_read(npu->chip_id, fir_addr, &fir_val);
>> +        xscom_read(npu->chip_id, mask_addr, &mask_val);
>> +        prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx =>
>> 0x%016llx)\n",
>> +            npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
>> +    }
>> +
>> +    /* NPU global registers */
>> +    scom_reg = npu2_scom_dump_global;
>> +    while (scom_reg->name) {
>> +        if (scom_reg->stack != NPU2_STACK_STCK_0) // not a per-stack reg
>> +            print_one_reg(npu, scom_reg, -1);
>> +        else
>> +            for (i = stack_min; i <= stack_max; i++)
>> +                print_one_reg(npu, scom_reg, i);
>> +        scom_reg++;
>> +    }
>> +
>> +    /* nvlink- or opencapi-specific registers */
>> +    for (i = 0; i < npu->total_devices; i++) {
>> +        dev = &npu->devices[i];
>> +        if (brick_index == -1 || dev->brick_index == brick_index) {
>> +            if (dev->type == NPU2_DEV_TYPE_NVLINK)
>> +                dump_npu2_regs_nvlink(npu, dev->brick_index);
>> +            else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
>> +                dump_npu2_regs_opencapi(npu, dev->brick_index);
>> +        }
>> +    }
>> +}
>> +
>> +void npu2_dump_scoms(int chip_id)
>> +{
>> +    struct npu2 *npu;
>> +    struct phb *phb;
>> +    struct npu2_dev *dev;
>> +
>> +    /*
>> +     * Look for the npu2 structure for that chip ID. We can access it
>> +     * through the array of phbs, looking for a nvlink or opencapi
>> +     * phb. We can have several entries, but they all point
>> +     * to the same npu2 structure
>> +     */
>> +    for_each_phb(phb) {
>> +        npu = NULL;
>> +        if (phb->phb_type == phb_type_npu_v2) {
>> +            npu = phb_to_npu2_nvlink(phb);
>> +        } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
>> +            dev = phb_to_npu2_dev_ocapi(phb);
>> +            npu = dev->npu;
>> +        }
>> +        if (npu && npu->chip_id == chip_id) {
>> +            dump_npu2_regs(npu, -1 /* all bricks */);
>> +            break;
>> +        }
>> +    }
>> +}
>> +
>>   static uint64_t npu2_ipi_attributes(struct irq_source *is __unused,
>> uint32_t isn __unused)
>>   {
>>       struct npu2 *p = is->data;
>> @@ -182,6 +493,7 @@ static void npu2_err_interrupt(struct irq_source
>> *is, uint32_t isn)
>>           brick = 2 + ((idx - 27) % 4);
>>           prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
>>               p->chip_id, brick);
>> +        dump_npu2_regs(p, brick);
>>           opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
>>                       OPAL_EVENT_PCI_ERROR);
>>           break;
>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>> index 939a23f5..ba10b8ea 100644
>> --- a/include/npu2-regs.h
>> +++ b/include/npu2-regs.h
>> @@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>>   #define NPU2_PERF_MASK                0x110
>>   #define NPU2_DBG0_CFG                0x118
>>   #define NPU2_DBG1_CFG                0x120
>> +#define NPU2_C_ERR_RPT_MSG5            0x128
>> +#define NPU2_C_ERR_RPT_MSG6            0x130
>>    /* CTL block registers */
>>   #define NPU2_CQ_CTL_MISC_CFG            0x000
>> @@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>> scom_base,
>>   #define NPU2_NTL_MISC_CFG3(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x008)
>>   #define NPU2_NTL_ERR_HOLD1(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x010)
>>   #define NPU2_NTL_ERR_MASK1(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x018)
>> +#define NPU2_NTL_ERR_FIRST1_OFF            0x020
>>   #define NPU2_NTL_ERR_FIRST1(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x020)
>>   #define NPU2_NTL_ERR_FIRST1_MASK(ndev)        NPU2_NTL_REG_OFFSET(ndev,
>> 0x028)
>>   #define NPU2_NTL_ERR_HOLD2(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x030)
>>   #define NPU2_NTL_ERR_MASK2(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x038)
>> +#define NPU2_NTL_ERR_FIRST2_OFF            0x040
>>   #define NPU2_NTL_ERR_FIRST2(ndev)        NPU2_NTL_REG_OFFSET(ndev, 0x040)
>>   #define NPU2_NTL_ERR_FIRST2_MASK(ndev)        NPU2_NTL_REG_OFFSET(ndev,
>> 0x048)
>>   #define NPU2_NTL_SCRATCH2(ndev)            NPU2_NTL_REG_OFFSET(ndev,
>> 0x050)
>> @@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>> scom_base,
>>   #define NPU2_OTL_OSL_DAR(stack, block)        NPU2_REG_OFFSET(stack,
>> block, 0x008)
>>   #define NPU2_OTL_OSL_TFC(stack, block)        NPU2_REG_OFFSET(stack,
>> block, 0x010)
>>   #define NPU2_OTL_OSL_PEHANDLE(stack, block)    NPU2_REG_OFFSET(stack,
>> block, 0x018)
>> +#define NPU2_OTL_ERR_RPT_HOLD0            0x30
>> +#define NPU2_OTL_RAS_ERR_MSG0            0x68
>> +#define NPU2_OTL_RXI_ERR_SIG            0x70
>> +#define NPU2_OTL_RXO_ERR_SIG            0x78
>> +#define NPU2_OTL_ERR_RPT_HOLD1            0xB0
>> +
>>    /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
>>    * there is only a single instance of each of these in the NPU so we
>> diff --git a/include/npu2.h b/include/npu2.h
>> index ef4e7aff..d58aab47 100644
>> --- a/include/npu2.h
>> +++ b/include/npu2.h
>> @@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
>>                  uint8_t *freeze_state,
>>                  uint16_t *pci_error_type __unused,
>>                  uint16_t *severity __unused);
>> +void npu2_dump_scoms(int chip_id);
>>   #endif /* __NPU2_H */
>