[Skiboot] Fwd: [PATCH 6/7] hw/npu2: Dump (more) npu2 registers on link error and HMIs
Frederic Barrat
fbarrat at linux.ibm.com
Fri Mar 22 03:56:42 AEDT 2019
Hi Alexey,
[ replying to the list ]
Thanks for the review! Many good points, a few more details below.
Le 21/03/2019 à 05:21, Alexey Kardashevskiy a écrit :
> Hi,
>
> I think I saw a similar one from Andrew as well so I just relaxed and I
> am waiting till Stewart takes any :) Comments below.
:-)
Though I think you're referring to a different patch from Andrew, who is
cleaning up a bunch of scom accesses in npu2_nvlink_init_npu()
> On 21/03/2019 07:25, Frederic Barrat wrote:
>> Hi Alexey,
>>
>> I forgot to CC you on that patch... It's related to something you had
>> sent before, where you were cleaning up how we dump npu2 registers on
>> HMI. Comments welcome :-)
>>
>> Fred
>>
>>
>> -------- Message transféré --------
>> Sujet : [Skiboot] [PATCH 6/7] hw/npu2: Dump (more) npu2 registers on
>> link error and HMIs
>> Date : Wed, 20 Mar 2019 19:35:21 +0100
>> De : Frederic Barrat <fbarrat at linux.ibm.com>
>> Pour : skiboot at lists.ozlabs.org, andrew.donnellan at au1.ibm.com
>> Copie à : clombard at linux.ibm.com, arbab at linux.ibm.com
>>
>> We were already logging some NPU registers during an HMI. This patch
>> cleans up a bit how it is done and separates what is global from what
>> is specific to nvlink or opencapi.
>>
>> Since we can now receive an error interrupt when an opencapi link goes
>> down unexpectedly, we also dump the NPU state but we limit it to the
>> registers of the brick which hit the error.
>>
>> The list of registers to dump was worked out with the hw team to
>> allow for proper debugging. For each register, we print the name as
>> found in the NPU workbook, the scom address and the register value.
>>
>> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
>> ---
>> core/hmi.c | 58 +-------
>> hw/npu2-common.c | 312 ++++++++++++++++++++++++++++++++++++++++++++
>> include/npu2-regs.h | 10 ++
>> include/npu2.h | 1 +
>> 4 files changed, 324 insertions(+), 57 deletions(-)
>>
>> diff --git a/core/hmi.c b/core/hmi.c
>> index fbb182c3..26277fa6 100644
>> --- a/core/hmi.c
>> +++ b/core/hmi.c
>> @@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>> queue_hmi_event(hmi_evt, 0, out_flags);
>> }
>> -/*
>> - * If the year is 2018 and you still see all these hardcoded, you
>> - * should really replace this with the neat macros that's in the
>> - * NPU2 code rather than this horrible listing of every single
>> - * NPU2 register hardcoded for a specific chip.
>> - *
>> - * I feel dirty having even written it.
>> - */
>> -static uint32_t npu2_scom_dump[] = {
>> - 0x5011017, 0x5011047, 0x5011077, 0x50110A7,
>> - 0x5011217, 0x5011247, 0x5011277, 0x50112A7,
>> - 0x5011417, 0x5011447, 0x5011477, 0x50114A7,
>> - 0x50110DA, 0x50112DA, 0x50114DA,
>> - 0x50110DB, 0x50112DB, 0x50114DB,
>> - 0x5011011, 0x5011041, 0x5011071, 0x50110A1,
>> - 0x5011211, 0x5011241, 0x5011271, 0x50112A1,
>> - 0x5011411, 0x5011441, 0x5011471, 0x50114A1,
>> - 0x5011018, 0x5011048, 0x5011078, 0x50110A8,
>> - 0x5011218, 0x5011248, 0x5011278, 0x50112A8,
>> - 0x5011418, 0x5011448, 0x5011478, 0x50114A8,
>> - 0x5011640,
>> - 0x5011114, 0x5011134, 0x5011314, 0x5011334,
>> - 0x5011514, 0x5011534, 0x5011118, 0x5011138,
>> - 0x5011318, 0x5011338, 0x5011518, 0x5011538,
>> - 0x50110D8, 0x50112D8, 0x50114D8,
>> - 0x50110D9, 0x50112D9, 0x50114D9,
>> - 0x5011019, 0x5011049, 0x5011079, 0x50110A9,
>> - 0x5011219, 0x5011249, 0x5011279, 0x50112A9,
>> - 0x5011419, 0x5011449, 0x5011479, 0x50114A9,
>> - 0x50110F4, 0x50112F4, 0x50114F4,
>> - 0x50110F5, 0x50112F5, 0x50114F5,
>> - 0x50110F6, 0x50112F6, 0x50114F6,
>> - 0x50110FD, 0x50112FD, 0x50114FD,
>> - 0x50110FE, 0x50112FE, 0x50114FE,
>> - 0x00
>> -};
>
>
> I cannot confirm if all of these are replaced but since it was not
> extremely usable in the first place - anything is better :)
I went through the list and I'm pretty sure they are all there.
Actually, there are a few more that the NPU team asked to add.
>
>> -
>> -static void dump_scoms(int flat_chip_id, const char *unit, uint32_t
>> *scoms,
>> - const char *loc)
>> -{
>> - uint64_t value;
>> - int r;
>> -
>> - while (*scoms != 0) {
>> - value = 0;
>> - r = _xscom_read(flat_chip_id, *scoms, &value, false);
>> - if (r != OPAL_SUCCESS)
>> - continue;
>> - prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
>> - unit, loc, flat_chip_id, *scoms, value);
>> - scoms++;
>> - }
>> -}
>> -
>> static bool phb_is_npu2(struct dt_node *dn)
>> {
>> return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
>> @@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int
>> flat_chip_id,
>> npu2_hmi_verbose = true;
>> if (npu2_hmi_verbose) {
>> - _xscom_lock();
>> - dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
>> - _xscom_unlock();
>> + npu2_dump_scoms(flat_chip_id);
>> prlog(PR_ERR, " _________________________ \n");
>> prlog(PR_ERR, "< It's Driver Debug time! >\n");
>> prlog(PR_ERR, " ------------------------- \n");
>> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
>> index ccbbbbca..e370ba87 100644
>> --- a/hw/npu2-common.c
>> +++ b/hw/npu2-common.c
>> @@ -103,6 +103,317 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t
>> reg, uint32_t val, uint32_t mas
>> (uint64_t)new_val << 32);
>> }
>> +typedef struct {
>> + const char *name;
>> + uint32_t stack;
>
> This one is NPU2_STACK_STCK_0 almost always, with just a single
> exception. You could just dump "XTS.REG.ERR_HOLD" separately and
> simplify the arrays. You could even s/__NPU2_SCOM_DUMP/__DUMP/g and make
> even item of the array be a single line => easier to read.
Indeed, there's only 1 register which is not per-stack. I took it out of
the array and print it separately, as it also simplify things, as you
noted below.
>> + uint32_t block;
>
>
> not "ntl"? Although I am still struggling with all these
> bricks/stacks/links/ntl/otl/blocks/whatever so never mind...
I did mean "block" :-) The block value could be ntl, or otl, or others.
>> + uint32_t offset;
>> +} npu2_scom_dump_t;
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_global[] = {
>> +#define __NPU2_SCOM_DUMP(name, stack, block, offset) { name, stack,
>> block, offset }
>> + /* CQ State Machine */
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE3",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE3",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE3",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE3",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE4",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE4",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE4",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE4",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE5",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE5",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE5",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE5",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE6",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE6",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE6",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE6",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1),
>> +
>> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2),
>> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2),
>> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2),
>> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2),
>> +
>> + /* CQ Control */
>> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0),
>> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1),
>> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0),
>> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1),
>> +
>> + /* CQ Data */
>> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_HOLD",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS),
>> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_MASK",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK),
>> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_FIRST",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST),
>> + __NPU2_SCOM_DUMP("DAT.MISC.REM0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0),
>> + __NPU2_SCOM_DUMP("DAT.MISC.REM1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1),
>> +
>> + /* XTS */
>> + __NPU2_SCOM_DUMP("XTS.REG.ERR_HOLD",
>> + NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0),
>> +
>> + { NULL, 0, 0, 0 }
>> +};
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
>> + __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF),
>> + __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF),
>> + __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF),
>> + __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST2",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF),
>> + { NULL, 0, 0, 0 }
>> +};
>> +
>> +static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
>> + __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0),
>> + __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0),
>> + __NPU2_SCOM_DUMP("OTL0.MISC.OTL_REM0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0),
>> + __NPU2_SCOM_DUMP("OTL1.MISC.OTL_REM0",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0),
>> + __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXI",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG),
>> + __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXI",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG),
>> + __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXO",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG),
>> + __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXO",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG),
>> + __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1),
>> + __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD1",
>> + NPU2_STACK_STCK_0, NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1),
>> + { NULL, 0, 0, 0 }
>> +};
>> +
>> +static void print_one_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int
>> stack)
>> +{
>> + uint64_t reg, val;
>> +
>> + if (scom->stack != NPU2_STACK_STCK_0)
>> + stack = scom->stack;
>> +
>> + reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
>> + val = npu2_scom_read(npu->chip_id, npu->xscom_base,
>> + reg, NPU2_MISC_DA_LEN_8B);
>> +
>> + if (scom->stack != NPU2_STACK_STCK_0)
>> + prlog(PR_ERR, "NPU[%d] %s 0x%llx = 0x%016llx\n",
>> + npu->chip_id, scom->name, reg, val);
>> + else
>> + prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
>> + npu->chip_id, stack - 4, scom->name, reg, val);
>
>
> The scom->stack vs stack logic is unclear here. When would not they be
> equal? Why would you treat NPU2_STACK_STCK_0 any different? Or it is all
> about separate handling of "XTS.REG.ERR_HOLD"?
>
>
It was for registers which are not per-stack, but as you noted, I ended
up with only one XTS register. So yes, it is overkill. I'll simplify
things by taking the XTS register outside of the array, so that function
becomes readable.
>> +}
>> +
>> +static void dump_npu2_regs_nvlink(struct npu2 *npu, int brick_index)
>> +{
>> + uint32_t stack, ntl;
>> + npu2_scom_dump_t *scom_reg;
>> +
>> + stack = NPU2_STACK_STCK_0 + brick_index / 2;
>> + ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
>> +
>> + scom_reg = npu2_scom_dump_nvlink;
>
>
> I find it weird to iterate till the null item here as all these arrays
> are static, the size is known, you never pass these arrays to any helper
> (you pass only a single item). ARRAY_SIZE() would look cleaner imo.
Good point. I'll switch to ARRAY_SIZE()
>
>
>> + while (scom_reg->name) {
>> + if (scom_reg->block == ntl)
>> + print_one_reg(npu, scom_reg, stack);
>> + scom_reg++;
>> + }
>> +}
>> +
>> +static void dump_npu2_regs_opencapi(struct npu2 *npu, int brick_index)
>> +{
>> + uint64_t val, addr;
>> + uint32_t stack, otl;
>> + npu2_scom_dump_t *scom_reg;
>> +
>> + stack = NPU2_STACK_STCK_0 + brick_index / 2;
>> + otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
>> +
>> + /* NPU registers */
>> + scom_reg = npu2_scom_dump_ocapi;
>> + while (scom_reg->name) {
>> + if (scom_reg->block == otl)
>> + print_one_reg(npu, scom_reg, stack);
>> + scom_reg++;
>> + }
>> +
>> + /* Fabric registers */
>> + addr = OB_ODL_STATUS(brick_index);
>> + xscom_read(npu->chip_id, addr, &val);
>> + prlog(PR_ERR, "NPU[%d] ODL status brick %d 0x%llx = 0x%016llx\n",
>> + npu->chip_id, brick_index, addr, val);
>
>
> Nit: this could make a nice helper like:
>
> xscom_read_err(npu->chip_id, brick_index, OB_ODL_STATUS(brick_index), msg)
>
OK, I'll see if I can even push to have one single function printing, to
guarantee to have a common format, see if it's worth it.
>> +
>> + addr = OB_ODL_TRAINING_STATUS(brick_index);
>> + xscom_read(npu->chip_id, addr, &val);
>> + prlog(PR_ERR, "NPU[%d] ODL training status brick %d 0x%llx =
>> 0x%016llx\n",
>> + npu->chip_id, brick_index, addr, val);
>> +
>> + addr = OB_ODL_ENDPOINT_INFO(brick_index);
>> + xscom_read(npu->chip_id, addr, &val);
>> + prlog(PR_ERR, "NPU[%d] ODL endpoint info brick %d 0x%llx =
>> 0x%016llx\n",
>> + npu->chip_id, brick_index, addr, val);
>> +}
>> +
>> +static void dump_npu2_regs(struct npu2 *npu, int brick_index)
>> +{
>> + int i, stack_min, stack_max;
>> + uint64_t fir_val, mask_val, fir_addr, mask_addr;
>> + struct npu2_dev *dev;
>> + npu2_scom_dump_t *scom_reg;
>> +
>> + if (brick_index != -1) {
>> + stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
>> + } else {
>> + stack_min = NPU2_STACK_STCK_0;
>> + stack_max = NPU2_STACK_STCK_2;
>> + /* Avoid dumping unused stacks for opencapi on Lagrange */
>> + if (npu->total_devices == 2)
>> + stack_min = stack_max = NPU2_STACK_STCK_1;
>> + }
>> +
>> + /* NPU FIRs */
>> + for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
>> + fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
>> + mask_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET +
>> NPU2_FIR_MASK_OFFSET;
>
> Tiny nit:
> mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
OK.
Thanks a lot, I'll be sending a v2 shortly.
Fred
>> + xscom_read(npu->chip_id, fir_addr, &fir_val);
>> + xscom_read(npu->chip_id, mask_addr, &mask_val);
>> + prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx =>
>> 0x%016llx)\n",
>> + npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
>> + }
>> +
>> + /* NPU global registers */
>> + scom_reg = npu2_scom_dump_global;
>> + while (scom_reg->name) {
>> + if (scom_reg->stack != NPU2_STACK_STCK_0) // not a per-stack reg
>> + print_one_reg(npu, scom_reg, -1);
>> + else
>> + for (i = stack_min; i <= stack_max; i++)
>> + print_one_reg(npu, scom_reg, i);
>> + scom_reg++;
>> + }
>> +
>> + /* nvlink- or opencapi-specific registers */
>> + for (i = 0; i < npu->total_devices; i++) {
>> + dev = &npu->devices[i];
>> + if (brick_index == -1 || dev->brick_index == brick_index) {
>> + if (dev->type == NPU2_DEV_TYPE_NVLINK)
>> + dump_npu2_regs_nvlink(npu, dev->brick_index);
>> + else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
>> + dump_npu2_regs_opencapi(npu, dev->brick_index);
>> + }
>> + }
>> +}
>> +
>> +void npu2_dump_scoms(int chip_id)
>> +{
>> + struct npu2 *npu;
>> + struct phb *phb;
>> + struct npu2_dev *dev;
>> +
>> + /*
>> + * Look for the npu2 structure for that chip ID. We can access it
>> + * through the array of phbs, looking for a nvlink or opencapi
>> + * phb. We can have several entries, but they all point
>> + * to the same npu2 structure
>> + */
>> + for_each_phb(phb) {
>> + npu = NULL;
>> + if (phb->phb_type == phb_type_npu_v2) {
>> + npu = phb_to_npu2_nvlink(phb);
>> + } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
>> + dev = phb_to_npu2_dev_ocapi(phb);
>> + npu = dev->npu;
>> + }
>> + if (npu && npu->chip_id == chip_id) {
>> + dump_npu2_regs(npu, -1 /* all bricks */);
>> + break;
>> + }
>> + }
>> +}
>> +
>> static uint64_t npu2_ipi_attributes(struct irq_source *is __unused,
>> uint32_t isn __unused)
>> {
>> struct npu2 *p = is->data;
>> @@ -182,6 +493,7 @@ static void npu2_err_interrupt(struct irq_source
>> *is, uint32_t isn)
>> brick = 2 + ((idx - 27) % 4);
>> prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
>> p->chip_id, brick);
>> + dump_npu2_regs(p, brick);
>> opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
>> OPAL_EVENT_PCI_ERROR);
>> break;
>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>> index 939a23f5..ba10b8ea 100644
>> --- a/include/npu2-regs.h
>> +++ b/include/npu2-regs.h
>> @@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>> #define NPU2_PERF_MASK 0x110
>> #define NPU2_DBG0_CFG 0x118
>> #define NPU2_DBG1_CFG 0x120
>> +#define NPU2_C_ERR_RPT_MSG5 0x128
>> +#define NPU2_C_ERR_RPT_MSG6 0x130
>> /* CTL block registers */
>> #define NPU2_CQ_CTL_MISC_CFG 0x000
>> @@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>> scom_base,
>> #define NPU2_NTL_MISC_CFG3(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x008)
>> #define NPU2_NTL_ERR_HOLD1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x010)
>> #define NPU2_NTL_ERR_MASK1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x018)
>> +#define NPU2_NTL_ERR_FIRST1_OFF 0x020
>> #define NPU2_NTL_ERR_FIRST1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x020)
>> #define NPU2_NTL_ERR_FIRST1_MASK(ndev) NPU2_NTL_REG_OFFSET(ndev,
>> 0x028)
>> #define NPU2_NTL_ERR_HOLD2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x030)
>> #define NPU2_NTL_ERR_MASK2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x038)
>> +#define NPU2_NTL_ERR_FIRST2_OFF 0x040
>> #define NPU2_NTL_ERR_FIRST2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x040)
>> #define NPU2_NTL_ERR_FIRST2_MASK(ndev) NPU2_NTL_REG_OFFSET(ndev,
>> 0x048)
>> #define NPU2_NTL_SCRATCH2(ndev) NPU2_NTL_REG_OFFSET(ndev,
>> 0x050)
>> @@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>> scom_base,
>> #define NPU2_OTL_OSL_DAR(stack, block) NPU2_REG_OFFSET(stack,
>> block, 0x008)
>> #define NPU2_OTL_OSL_TFC(stack, block) NPU2_REG_OFFSET(stack,
>> block, 0x010)
>> #define NPU2_OTL_OSL_PEHANDLE(stack, block) NPU2_REG_OFFSET(stack,
>> block, 0x018)
>> +#define NPU2_OTL_ERR_RPT_HOLD0 0x30
>> +#define NPU2_OTL_RAS_ERR_MSG0 0x68
>> +#define NPU2_OTL_RXI_ERR_SIG 0x70
>> +#define NPU2_OTL_RXO_ERR_SIG 0x78
>> +#define NPU2_OTL_ERR_RPT_HOLD1 0xB0
>> +
>> /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
>> * there is only a single instance of each of these in the NPU so we
>> diff --git a/include/npu2.h b/include/npu2.h
>> index ef4e7aff..d58aab47 100644
>> --- a/include/npu2.h
>> +++ b/include/npu2.h
>> @@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
>> uint8_t *freeze_state,
>> uint16_t *pci_error_type __unused,
>> uint16_t *severity __unused);
>> +void npu2_dump_scoms(int chip_id);
>> #endif /* __NPU2_H */
>
More information about the Skiboot
mailing list