[Skiboot] [PATCH] hw/npu2: Implement logging HMI actions

Balbir Singh bsingharora at gmail.com
Thu Nov 30 12:53:29 AEDT 2017


On Mon, 27 Nov 2017 15:16:10 +1100
Alistair Popple <alistair at popple.id.au> wrote:

> Thanks for putting the together Balbir. Couple of comments below.
> 
> On Monday, 27 November 2017 12:57:25 PM AEDT Balbir Singh wrote:
> > Log HMI errors as step 1. OS will need to deduce
> > and interpret the HMI event.
> > 
> > Signed-off-by: Balbir Singh <bsingharora at gmail.com>
> > ---
> >  core/hmi.c          | 84 ++++++++++++++++++++++++++++++++++++++++++++-
> >  include/npu2-regs.h | 18 ++++++++++
> >  include/opal-api.h  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 199 insertions(+), 1 deletion(-)
> > 
> > diff --git a/core/hmi.c b/core/hmi.c
> > index 07c08462..b450032c 100644
> > --- a/core/hmi.c
> > +++ b/core/hmi.c
> > @@ -23,6 +23,7 @@
> >  #include <cpu.h>
> >  #include <chip.h>
> >  #include <npu-regs.h>
> > +#include <npu2-regs.h>
> >  #include <npu.h>
> >  #include <capp.h>
> >  
> > @@ -517,6 +518,87 @@ static void find_nx_checkstop_reason(int flat_chip_id,
> >  	*event_generated = true;
> >  }
> >  
> > +static void find_npu2_checkstop_reason(int flat_chip_id,
> > +				      struct OpalHMIEvent *hmi_evt,
> > +				      bool *event_generated)
> > +{
> > +	struct phb *phb;
> > +	struct npu *p = NULL;
> > +	int i;
> > +
> > +	uint64_t npu2_fir;
> > +	uint64_t npu2_fir_mask;
> > +	uint64_t npu2_fir_action0;
> > +	uint64_t npu2_fir_action1;
> > +	uint64_t npu2_fir_addr;
> > +	uint64_t npu2_fir_mask_addr;
> > +	uint64_t npu2_fir_action0_addr;
> > +	uint64_t npu2_fir_action1_addr;
> > +	uint64_t fatal_errors;
> > +
> > +	/* Only check for NPU errors if the chip has a NPU */
> > +	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9)
> > +		return;  
> 
> We don't need this. The loop should check if there is a compatible NPU or not on
> this chip.
>

Will do, dropping this bit

> > +	/* Find the NPU on the chip associated with the HMI. */
> > +	for_each_phb(phb) {
> > +		/* NOTE: if a chip ever has >1 NPU this will need adjusting */
> > +		if (dt_node_is_compatible(phb->dt_node, "ibm,power9-npu-pciex") &&
> > +		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
> > +			p = phb_to_npu(phb);
> > +			break;
> > +		}
> > +	}
> > +
> > +	/* If we didn't find a NPU on the chip, it's not our checkstop. */
> > +	if (p == NULL)
> > +		return;
> > +
> > +	npu2_fir_addr = NPU2_FIR_REGISTER_0;
> > +	npu2_fir_mask_addr = NPU2_FIR_REGISTER_0_MASK;
> > +	npu2_fir_action0_addr = NPU2_FIR_REGISTER_0_ACTION0;
> > +	npu2_fir_action1_addr = NPU2_FIR_REGISTER_0_ACTION1;
> > +
> > +	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
> > +		/* Read all the registers necessary to find a checkstop condition. */
> > +		if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) ||
> > +			xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) ||
> > +			xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) ||
> > +			xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) {
> > +			prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i);
> > +			continue;
> > +		}
> > +
> > +		fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1;
> > +
> > +		/* If there's no errors, we don't need to do anything. */
> > +		if (!fatal_errors)
> > +			continue;
> > +
> > +		prlog(PR_ERR, "NPU: FIR#%d FIR 0x%016llx mask 0x%016llx\n",
> > +			i, npu2_fir, npu2_fir_mask);
> > +		prlog(PR_ERR, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n",
> > +			npu2_fir_action0, npu2_fir_action1);
> > +
> > +		/* Can't do a fence yet, we are just logging fir information for now */
> > +		npu2_fir_addr += NPU2_FIR_OFFSET;
> > +		npu2_fir_mask_addr += NPU2_FIR_OFFSET;
> > +		npu2_fir_action0_addr += NPU2_FIR_OFFSET;
> > +		npu2_fir_action1_addr += NPU2_FIR_OFFSET;
> > +
> > +	}  
> 
> Not sure I love this loop but it works and I don't really have a better
> suggestion. Although perhaps it would be a little clearer if you just stored the
> FIR "base address" (ie. npu2_fir_addr) and generated the related registers
> (mask, action, etc) as offsets from that.

OK, that can be done

> 
> > +	/* Set up the HMI event */
> > +	hmi_evt->severity = OpalHMI_SEV_WARNING;
> > +	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> > +	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> > +	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
> > +
> > +	/* Marking the event as recoverable so that we don't crash */  
> 
> You hope :-)
> 
> It wouldn't suprise me if there were at least some errors detected here that may
> result in other system xstops, but I agree that a NPU HMI shouldn't result in
> skiboot/firmware bringing down the system.

Yep, it's best effort. The idea is to send the information down to the kernel.
How the kernel communicates with other bits or just prints out the state of
affairs as they stand is TBD.

> 
> > +	queue_hmi_event(hmi_evt, 1);
> > +	*event_generated = true;
> > +}
> > +
> >  static void find_npu_checkstop_reason(int flat_chip_id,
> >  				      struct OpalHMIEvent *hmi_evt,
> >  				      bool *event_generated)
> > @@ -532,7 +614,7 @@ static void find_npu_checkstop_reason(int flat_chip_id,
> >  
> >  	/* Only check for NPU errors if the chip has a NPU */
> >  	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)  
> 
> The previous comment is equally valid for NVLink1 too, but I must've missed it
> when this went in.


I can fix it up

> 
> > -		return;
> > +		return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, event_generated);  
> 
> I don't think this should go here. In my opinion it would be better if
> find_npu2_checkstop_reason() was called directly from decode_malfunction().
> 
> >  	/* Find the NPU on the chip associated with the HMI. */
> >  	for_each_phb(phb) {
> > diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> > index fdaad192..197bda12 100644
> > --- a/include/npu2-regs.h
> > +++ b/include/npu2-regs.h
> > @@ -475,4 +475,22 @@ void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
> >  #define NPU2_DD1_MISC_SCOM_IND_SCOM_DATA	0x38f
> >  #define NPU2_MISC_SCOM_IND_SCOM_DATA		0x68f
> >  
> > +#define NPU2_FIR_REGISTER_0			0x0000000005013C00
> > +#define NPU2_FIR_REGISTER_0_MASK		0x0000000005013C03
> > +#define NPU2_FIR_REGISTER_0_ACTION0		0x0000000005013C06
> > +#define NPU2_FIR_REGISTER_0_ACTION1		0x0000000005013C07
> > +
> > +#define NPU2_FIR_REGISTER_1			0x0000000005013C40
> > +#define NPU2_FIR_REGISTER_1_MASK		0x0000000005013C43
> > +#define NPU2_FIR_REGISTER_1_ACTION0		0x0000000005013C46
> > +#define NPU2_FIR_REGISTER_1_ACTION1		0x0000000005013C47
> > +
> > +#define NPU2_FIR_REGISTER_2			0x0000000005013C80
> > +#define NPU2_FIR_REGISTER_2_MASK		0x0000000005013C83
> > +#define NPU2_FIR_REGISTER_2_ACTION0		0x0000000005013C86
> > +#define NPU2_FIR_REGISTER_2_ACTION1		0x0000000005013C87
> > +
> > +#define NPU2_TOTAL_FIR_REGISTERS		3
> > +#define NPU2_FIR_OFFSET				0x40
> > +
> >  #endif /* __NPU2_REGS_H */
> > diff --git a/include/opal-api.h b/include/opal-api.h
> > index 0bc036ed..cef88f68 100644
> > --- a/include/opal-api.h
> > +++ b/include/opal-api.h
> > @@ -725,6 +725,104 @@ enum OpalHMI_NestAccelXstopReason {
> >  	NX_CHECKSTOP_PBI_ISN_UE			= 0x00002000,
> >  };
> >  
> > +/*
> > + * Can't use enums for 64 bit values, use #defines
> > + */
> > +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_CE		PPC_BIT(0)
> > +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_HDR_CE		PPC_BIT(1)
> > +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_UE		PPC_BIT(2)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_FLIT_PERR		PPC_BIT(3)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_DATA_PERR		PPC_BIT(4)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_MALFOR		PPC_BIT(5)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_UNSUPPORTED	PPC_BIT(6)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_CONFIG_ERR		PPC_BIT(7)
> > +#define NPU2_CHECKSTOP_REG0_NTL_NVL_CRC_ERR		PPC_BIT(8)
> > +#define NPU2_CHECKSTOP_REG0_NTL_PRI_ERR			PPC_BIT(9)
> > +#define NPU2_CHECKSTOP_REG0_NTL_LOGIC_ERR		PPC_BIT(10)
> > +#define NPU2_CHECKSTOP_REG0_NTL_LMD_POISON		PPC_BIT(11)
> > +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_SUE		PPC_BIT(12)
> > +#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_CE		PPC_BIT(13)
> > +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_RECOV_ERR		PPC_BIT(14)
> > +#define NPU2_CHECKSTOP_REG0_CTL_REG_RING_ERR		PPC_BIT(15)
> > +#define NPU2_CHECKSTOP_REG0_CTL_MMIO_ST_DATA_UE		PPC_BIT(16)
> > +#define NPU2_CHECKSTOP_REG0_CTL_PEF			PPC_BIT(17)
> > +#define NPU2_CHECKSTOP_REG0_CTL_NVL_CFG_ERR		PPC_BIT(18)
> > +#define NPU2_CHECKSTOP_REG0_CTL_NVL_FATAL_ERR		PPC_BIT(19)
> > +#define NPU2_CHECKSTOP_REG0_RESERVED_1			PPC_BIT(20)
> > +#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_UE		PPC_BIT(21)
> > +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_PERR		PPC_BIT(22)
> > +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_FATAL_ERR		PPC_BIT(23)
> > +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_CONFIG_ERR		PPC_BIT(24)
> > +#define NPU2_CHECKSTOP_REG0_CTL_FWD_PROGRESS_ERR	PPC_BIT(25)
> > +#define NPU2_CHECKSTOP_REG0_CTL_LOGIC_ERR		PPC_BIT(26)
> > +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_UE		PPC_BIT(29)
> > +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_CE		PPC_BIT(30)
> > +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_PERR		PPC_BIT(31)
> > +#define NPU2_CHECKSTOP_REG0_DAT_CREG_PERR		PPC_BIT(32)
> > +#define NPU2_CHECKSTOP_REG0_DAT_RTAG_PERR		PPC_BIT(33)
> > +#define NPU2_CHECKSTOP_REG0_DAT_STATE_PERR		PPC_BIT(34)
> > +#define NPU2_CHECKSTOP_REG0_DAT_LOGIC_ERR		PPC_BIT(35)
> > +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_SUE		PPC_BIT(36)
> > +#define NPU2_CHECKSTOP_REG0_DAT_PBRX_SUE		PPC_BIT(37)
> > +#define NPU2_CHECKSTOP_REG0_XTS_INT			PPC_BIT(40)
> > +#define NPU2_CHECKSTOP_REG0_XTS_SRAM_CE			PPC_BIT(41)
> > +#define NPU2_CHECKSTOP_REG0_XTS_SRAM_UE			PPC_BIT(42)
> > +#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_CE		PPC_BIT(43)
> > +#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_UE		PPC_BIT(44)
> > +#define NPU2_CHECKSTOP_REG0_XTS_PBUS_PROTOCOL		PPC_BIT(45)
> > +
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK0_STALL			PPC_BIT(0)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK0_NOSTALL			PPC_BIT(1)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK1_STALL			PPC_BIT(2)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK1_NOSTALL			PPC_BIT(3)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK2_STALL			PPC_BIT(4)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK2_NOSTALL			PPC_BIT(5)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK3_STALL			PPC_BIT(6)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK3_NOSTALL			PPC_BIT(7)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK4_STALL			PPC_BIT(8)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK4_NOSTALL			PPC_BIT(9)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK5_STALL			PPC_BIT(10)
> > +#define NPU2_CHECKSTOP_REG1_NDL_BRK5_NOSTALL			PPC_BIT(11)
> > +#define NPU2_CHECKSTOP_REG1_MISC_REG_RING_ERR			PPC_BIT(12)
> > +#define NPU2_CHECKSTOP_REG1_MISC_INT_RA_PERR			PPC_BIT(13)
> > +#define NPU2_CHECKSTOP_REG1_MISC_DA_ADDR_PERR			PPC_BIT(14)
> > +#define NPU2_CHECKSTOP_REG1_MISC_CTRL_PERR			PPC_BIT(15)
> > +#define NPU2_CHECKSTOP_REG1_MISC_NMMU_ERR			PPC_BIT(16)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TVT_ENTRY_INVALID		PPC_BIT(17)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TVT_ADDR_RANGE_ERR		PPC_BIT(18)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_CA_ERR		PPC_BIT(19)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TCE_CACHE_MULT_HIT_ERR		PPC_BIT(20)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_TW_ERR		PPC_BIT(21)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TCE_REQ_TO_ERR			PPC_BIT(22)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TCD_PERR			PPC_BIT(23)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TDR_PERR			PPC_BIT(24)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_UE			PPC_BIT(25)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_CE			PPC_BIT(26)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_UE			PPC_BIT(27)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_CE			PPC_BIT(28)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_UE			PPC_BIT(29)
> > +#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_CE			PPC_BIT(30)
> > +#define NPU2_CHECKSTOP_REG1_ATS_TVT_PERR			PPC_BIT(31)
> > +#define NPU2_CHECKSTOP_REG1_ATS_IODA_ADDR_PERR			PPC_BIT(32)
> > +#define NPU2_CHECKSTOP_REG1_ATS_NPU_CTRL_PERR			PPC_BIT(33)
> > +#define NPU2_CHECKSTOP_REG1_ATS_NPU_TOR_PERR			PPC_BIT(34)
> > +#define NPU2_CHECKSTOP_REG1_ATS_INVAL_IODA_TBL_SEL		PPC_BIT(35)
> > +
> > +#define NPU2_CHECKSTOP_REG2_XSL_MMIO_INVALIDATE_REQ_WHILE_1_INPROG	PPC_BIT(36)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_0			PPC_BIT(37)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_1			PPC_BIT(38)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_RD_PEE_COMPLETION		PPC_BIT(39)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_CO_RESP			PPC_BIT(40)
> > +#define NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID		PPC_BIT(41)
> > +#define NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE				PPC_BIT(42)
> > +#define NPU2_CHECKSTOP_REG2_XSL_BLOOM_FILTER_PROTECT_ERR		PPC_BIT(43)
> > +#define NPU2_CHECKSTOP_REG2_XSL_CE					PPC_BIT(46)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UE					PPC_BIT(47)
> > +#define NPU2_CHECKSTOP_REG2_XSL_SLBI_TLBI_BUFF_OVERFLOW			PPC_BIT(48)
> > +#define NPU2_CHECKSTOP_REG2_XSL_SBE_CORR_ERR_PB_CHKOUT_RSP_DATA		PPC_BIT(49)
> > +#define NPU2_CHECKSTOP_REG2_XSL_UE_PB_CHKOUT_RSP_DATA			PPC_BIT(50)
> > +#define NPU2_CHECKSTOP_REG2_XSL_SUE_PB_CHKOUT_RSP_DATA			PPC_BIT(51)
> > +  
> 
> I couldn't see these referenced anywhere, do we use them at all?
>

Yep, they've been added to opal-api.h, in the hope that we can send the fir
bits down and have the kernel mask and print the appropriate information
when we add support.

Thanks for the review!
Balbir Singh.


More information about the Skiboot mailing list