[Skiboot] [PATCH v2 6/7] hw/npu2: Dump (more) npu2 registers on link error and HMIs
Andrew Donnellan
andrew.donnellan at au1.ibm.com
Thu Apr 4 16:52:35 AEDT 2019
On 26/3/19 5:29 am, Frederic Barrat wrote:
> We were already logging some NPU registers during an HMI. This patch
> cleans up a bit how it is done and separates what is global from what
> is specific to nvlink or opencapi.
>
> Since we can now receive an error interrupt when an opencapi link goes
> down unexpectedly, we also dump the NPU state but we limit it to the
> registers of the brick which hit the error.
>
> The list of registers to dump was worked out with the hw team to
> allow for proper debugging. For each register, we print the name as
> found in the NPU workbook, the scom address and the register value.
>
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
> ---
> Changelog
> v2:
> - Simplify per-stack and stack-independent register handling by
> treating the XTS register separately
> - use ARRAY_SIZE() to iterate over the registers to dump
>
>
> core/hmi.c | 58 +--------
> hw/npu2-common.c | 299 ++++++++++++++++++++++++++++++++++++++++++++
> include/npu2-regs.h | 10 ++
> include/npu2.h | 1 +
> 4 files changed, 311 insertions(+), 57 deletions(-)
>
> diff --git a/core/hmi.c b/core/hmi.c
> index fbb182c3..26277fa6 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
> queue_hmi_event(hmi_evt, 0, out_flags);
> }
>
> -/*
> - * If the year is 2018 and you still see all these hardcoded, you
> - * should really replace this with the neat macros that's in the
> - * NPU2 code rather than this horrible listing of every single
> - * NPU2 register hardcoded for a specific chip.
> - *
> - * I feel dirty having even written it.
> - */
> -static uint32_t npu2_scom_dump[] = {
> - 0x5011017, 0x5011047, 0x5011077, 0x50110A7,
> - 0x5011217, 0x5011247, 0x5011277, 0x50112A7,
> - 0x5011417, 0x5011447, 0x5011477, 0x50114A7,
> - 0x50110DA, 0x50112DA, 0x50114DA,
> - 0x50110DB, 0x50112DB, 0x50114DB,
> - 0x5011011, 0x5011041, 0x5011071, 0x50110A1,
> - 0x5011211, 0x5011241, 0x5011271, 0x50112A1,
> - 0x5011411, 0x5011441, 0x5011471, 0x50114A1,
> - 0x5011018, 0x5011048, 0x5011078, 0x50110A8,
> - 0x5011218, 0x5011248, 0x5011278, 0x50112A8,
> - 0x5011418, 0x5011448, 0x5011478, 0x50114A8,
> - 0x5011640,
> - 0x5011114, 0x5011134, 0x5011314, 0x5011334,
> - 0x5011514, 0x5011534, 0x5011118, 0x5011138,
> - 0x5011318, 0x5011338, 0x5011518, 0x5011538,
> - 0x50110D8, 0x50112D8, 0x50114D8,
> - 0x50110D9, 0x50112D9, 0x50114D9,
> - 0x5011019, 0x5011049, 0x5011079, 0x50110A9,
> - 0x5011219, 0x5011249, 0x5011279, 0x50112A9,
> - 0x5011419, 0x5011449, 0x5011479, 0x50114A9,
> - 0x50110F4, 0x50112F4, 0x50114F4,
> - 0x50110F5, 0x50112F5, 0x50114F5,
> - 0x50110F6, 0x50112F6, 0x50114F6,
> - 0x50110FD, 0x50112FD, 0x50114FD,
> - 0x50110FE, 0x50112FE, 0x50114FE,
> - 0x00
> -};
> -
> -static void dump_scoms(int flat_chip_id, const char *unit, uint32_t *scoms,
> - const char *loc)
> -{
> - uint64_t value;
> - int r;
> -
> - while (*scoms != 0) {
> - value = 0;
> - r = _xscom_read(flat_chip_id, *scoms, &value, false);
> - if (r != OPAL_SUCCESS)
> - continue;
> - prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
> - unit, loc, flat_chip_id, *scoms, value);
> - scoms++;
> - }
> -}
> -
> static bool phb_is_npu2(struct dt_node *dn)
> {
> return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
> @@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
> npu2_hmi_verbose = true;
>
> if (npu2_hmi_verbose) {
> - _xscom_lock();
> - dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
> - _xscom_unlock();
> + npu2_dump_scoms(flat_chip_id);
> prlog(PR_ERR, " _________________________ \n");
> prlog(PR_ERR, "< It's Driver Debug time! >\n");
> prlog(PR_ERR, " ------------------------- \n");
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index ccbbbbca..def23728 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -103,6 +103,304 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
> (uint64_t)new_val << 32);
> }
>
> +typedef struct {
> + const char *name;
> + uint32_t block;
> + uint32_t offset;
> +} npu2_scom_dump_t;
> +
> +static npu2_scom_dump_t npu2_scom_dump_global[] = {
> +#define __NPU2_SCOM_DUMP(name, block, offset) { name, block, offset }
What's the benefit of using this macro? And if we are going to use it,
can we define it outside of the array definition?
> + /* CQ State Machine */
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE0",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE0",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE0",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE0",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE1",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE1",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE1",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE1",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE2",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE2",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE2",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE2",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE3",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE3",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE3",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE3",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE4",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE4",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE4",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE4",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE5",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE5",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE5",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE5",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_MESSAGE6",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_MESSAGE6",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_MESSAGE6",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_MESSAGE6",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST0",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST0",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST0",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST0",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST1",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST1",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST1",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST1",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1),
> +
> + __NPU2_SCOM_DUMP("CS.SM0.MISC.CERR_FIRST2",
> + NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2),
> + __NPU2_SCOM_DUMP("CS.SM1.MISC.CERR_FIRST2",
> + NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2),
> + __NPU2_SCOM_DUMP("CS.SM2.MISC.CERR_FIRST2",
> + NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2),
> + __NPU2_SCOM_DUMP("CS.SM3.MISC.CERR_FIRST2",
> + NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2),
> +
> + /* CQ Control */
> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE0",
> + NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0),
> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_MESSAGE1",
> + NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1),
> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST0",
> + NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0),
> + __NPU2_SCOM_DUMP("CS.CTL.MISC.CERR_FIRST1",
> + NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1),
> +
> + /* CQ Data */
> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_HOLD",
> + NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS),
> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_MASK",
> + NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK),
> + __NPU2_SCOM_DUMP("DAT.MISC.CERR_ECC_FIRST",
> + NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST),
> + __NPU2_SCOM_DUMP("DAT.MISC.REM0",
> + NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0),
> + __NPU2_SCOM_DUMP("DAT.MISC.REM1",
> + NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1),
> +};
> +
> +static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
> + __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST1",
> + NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF),
> + __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST1",
> + NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF),
> + __NPU2_SCOM_DUMP("NTL0.REGS.CERR_FIRST2",
> + NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF),
> + __NPU2_SCOM_DUMP("NTL1.REGS.CERR_FIRST2",
> + NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF),
> +};
> +
> +static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
> + __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD0",
> + NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0),
> + __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD0",
> + NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0),
> + __NPU2_SCOM_DUMP("OTL0.MISC.OTL_REM0",
> + NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0),
> + __NPU2_SCOM_DUMP("OTL1.MISC.OTL_REM0",
> + NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0),
> + __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXI",
> + NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG),
> + __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXI",
> + NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG),
> + __NPU2_SCOM_DUMP("OTL0.MISC.ERROR_SIG_RXO",
> + NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG),
> + __NPU2_SCOM_DUMP("OTL1.MISC.ERROR_SIG_RXO",
> + NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG),
> + __NPU2_SCOM_DUMP("OTL0.MISC.C_ERR_RPT_HOLD1",
> + NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1),
> + __NPU2_SCOM_DUMP("OTL1.MISC.C_ERR_RPT_HOLD1",
> + NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1),
> +};
> +
> +static void print_one_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
> +{
> + uint64_t reg, val;
> +
> + reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
> + val = npu2_scom_read(npu->chip_id, npu->xscom_base,
> + reg, NPU2_MISC_DA_LEN_8B);
> +
> + prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
> + npu->chip_id, stack - 4, scom->name, reg, val);
> +}
> +
> +static void dump_npu2_regs_nvlink(struct npu2 *npu, int brick_index)
> +{
> + uint32_t stack, ntl;
> + int i;
> +
> + stack = NPU2_STACK_STCK_0 + brick_index / 2;
> + ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
> +
> + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
> + if (npu2_scom_dump_nvlink[i].block == ntl)
> + print_one_reg(npu, &npu2_scom_dump_nvlink[i], stack);
> + }
> +}
> +
> +static void dump_npu2_regs_opencapi(struct npu2 *npu, int brick_index)
> +{
> + uint64_t val, addr;
> + uint32_t stack, otl;
> + int i;
> +
> + stack = NPU2_STACK_STCK_0 + brick_index / 2;
> + otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
> +
> + /* NPU registers */
> + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
> + if (npu2_scom_dump_ocapi[i].block == otl)
> + print_one_reg(npu, &npu2_scom_dump_ocapi[i], stack);
> + }
> +
> + /* Fabric registers */
> + addr = OB_ODL_STATUS(brick_index);
> + xscom_read(npu->chip_id, addr, &val);
> + prlog(PR_ERR, "NPU[%d] ODL status brick %d 0x%llx = 0x%016llx\n",
> + npu->chip_id, brick_index, addr, val);
> +
> + addr = OB_ODL_TRAINING_STATUS(brick_index);
> + xscom_read(npu->chip_id, addr, &val);
> + prlog(PR_ERR, "NPU[%d] ODL training status brick %d 0x%llx = 0x%016llx\n",
> + npu->chip_id, brick_index, addr, val);
> +
> + addr = OB_ODL_ENDPOINT_INFO(brick_index);
> + xscom_read(npu->chip_id, addr, &val);
> + prlog(PR_ERR, "NPU[%d] ODL endpoint info brick %d 0x%llx = 0x%016llx\n",
> + npu->chip_id, brick_index, addr, val);
> +}
> +
> +static void dump_npu2_regs(struct npu2 *npu, int brick_index)
> +{
> + int i, stack, stack_min, stack_max;
> + uint64_t fir_val, mask_val, fir_addr, mask_addr;
> + uint64_t reg, val;
> + struct npu2_dev *dev;
> +
> + if (brick_index != -1) {
> + stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
> + } else {
> + stack_min = NPU2_STACK_STCK_0;
> + stack_max = NPU2_STACK_STCK_2;
> + /* Avoid dumping unused stacks for opencapi on Lagrange */
> + if (npu->total_devices == 2)
> + stack_min = stack_max = NPU2_STACK_STCK_1;
> + }
> +
> + /* NPU FIRs */
> + for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
> + fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
> + mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
> + xscom_read(npu->chip_id, fir_addr, &fir_val);
> + xscom_read(npu->chip_id, mask_addr, &mask_val);
> + prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
> + npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
> + }
> +
> + /* NPU global, per-stack registers */
> + for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
> + for (stack = stack_min; stack <= stack_max; stack++)
> + print_one_reg(npu, &npu2_scom_dump_global[i], stack);
> + }
> +
> + /*
> + * NPU global registers, stack independent
> + *
> + * We have only one for now, so dump it directly
> + */
> + reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_XTS, 0);
> + val = npu2_scom_read(npu->chip_id, npu->xscom_base,
> + reg, NPU2_MISC_DA_LEN_8B);
> +
> + prlog(PR_ERR, "NPU[%d] XTS.REG.ERR_HOLD 0x%llx = 0x%016llx\n",
> + npu->chip_id, reg, val);
> +
> + /* nvlink- or opencapi-specific registers */
> + for (i = 0; i < npu->total_devices; i++) {
> + dev = &npu->devices[i];
> + if (brick_index == -1 || dev->brick_index == brick_index) {
> + if (dev->type == NPU2_DEV_TYPE_NVLINK)
> + dump_npu2_regs_nvlink(npu, dev->brick_index);
> + else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
> + dump_npu2_regs_opencapi(npu, dev->brick_index);
> + }
> + }
> +}
> +
> +void npu2_dump_scoms(int chip_id)
Can we rename this to something that's less likely to get mentally confused
with "dump_npu2_regs"? I don't have suggestions.
> +{
> + struct npu2 *npu;
> + struct phb *phb;
> + struct npu2_dev *dev;
> +
> + /*
> + * Look for the npu2 structure for that chip ID. We can access it
> + * through the array of phbs, looking for a nvlink or opencapi
> + * phb. We can have several entries, but they all point
> + * to the same npu2 structure
> + */
> + for_each_phb(phb) {
> + npu = NULL;
> + if (phb->phb_type == phb_type_npu_v2) {
> + npu = phb_to_npu2_nvlink(phb);
> + } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
> + dev = phb_to_npu2_dev_ocapi(phb);
> + npu = dev->npu;
> + }
> + if (npu && npu->chip_id == chip_id) {
> + dump_npu2_regs(npu, -1 /* all bricks */);
> + break;
> + }
> + }
> +}
> +
> static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
> {
> struct npu2 *p = is->data;
> @@ -182,6 +480,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
> brick = 2 + ((idx - 27) % 4);
> prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
> p->chip_id, brick);
> + dump_npu2_regs(p, brick);
> opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> OPAL_EVENT_PCI_ERROR);
> break;
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 939a23f5..ba10b8ea 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
> #define NPU2_PERF_MASK 0x110
> #define NPU2_DBG0_CFG 0x118
> #define NPU2_DBG1_CFG 0x120
> +#define NPU2_C_ERR_RPT_MSG5 0x128
> +#define NPU2_C_ERR_RPT_MSG6 0x130
>
> /* CTL block registers */
> #define NPU2_CQ_CTL_MISC_CFG 0x000
> @@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
> #define NPU2_NTL_MISC_CFG3(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x008)
> #define NPU2_NTL_ERR_HOLD1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x010)
> #define NPU2_NTL_ERR_MASK1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x018)
> +#define NPU2_NTL_ERR_FIRST1_OFF 0x020
> #define NPU2_NTL_ERR_FIRST1(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x020)
> #define NPU2_NTL_ERR_FIRST1_MASK(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x028)
> #define NPU2_NTL_ERR_HOLD2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x030)
> #define NPU2_NTL_ERR_MASK2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x038)
> +#define NPU2_NTL_ERR_FIRST2_OFF 0x040
> #define NPU2_NTL_ERR_FIRST2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x040)
> #define NPU2_NTL_ERR_FIRST2_MASK(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x048)
> #define NPU2_NTL_SCRATCH2(ndev) NPU2_NTL_REG_OFFSET(ndev, 0x050)
> @@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
> #define NPU2_OTL_OSL_DAR(stack, block) NPU2_REG_OFFSET(stack, block, 0x008)
> #define NPU2_OTL_OSL_TFC(stack, block) NPU2_REG_OFFSET(stack, block, 0x010)
> #define NPU2_OTL_OSL_PEHANDLE(stack, block) NPU2_REG_OFFSET(stack, block, 0x018)
> +#define NPU2_OTL_ERR_RPT_HOLD0 0x30
> +#define NPU2_OTL_RAS_ERR_MSG0 0x68
> +#define NPU2_OTL_RXI_ERR_SIG 0x70
> +#define NPU2_OTL_RXO_ERR_SIG 0x78
> +#define NPU2_OTL_ERR_RPT_HOLD1 0xB0
> +
>
> /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
> * there is only a single instance of each of these in the NPU so we
> diff --git a/include/npu2.h b/include/npu2.h
> index ef4e7aff..d58aab47 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
> uint8_t *freeze_state,
> uint16_t *pci_error_type __unused,
> uint16_t *severity __unused);
> +void npu2_dump_scoms(int chip_id);
> #endif /* __NPU2_H */
>
--
Andrew Donnellan OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com IBM Australia Limited
More information about the Skiboot
mailing list