[Skiboot] [PATCH] hw/npu2: Implement logging HMI actions

Alistair Popple alistair at popple.id.au
Mon Nov 27 15:16:10 AEDT 2017


Thanks for putting the together Balbir. Couple of comments below.

On Monday, 27 November 2017 12:57:25 PM AEDT Balbir Singh wrote:
> Log HMI errors as step 1. OS will need to deduce
> and interpret the HMI event.
> 
> Signed-off-by: Balbir Singh <bsingharora at gmail.com>
> ---
>  core/hmi.c          | 84 ++++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h | 18 ++++++++++
>  include/opal-api.h  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 199 insertions(+), 1 deletion(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index 07c08462..b450032c 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -23,6 +23,7 @@
>  #include <cpu.h>
>  #include <chip.h>
>  #include <npu-regs.h>
> +#include <npu2-regs.h>
>  #include <npu.h>
>  #include <capp.h>
>  
> @@ -517,6 +518,87 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>  	*event_generated = true;
>  }
>  
> +static void find_npu2_checkstop_reason(int flat_chip_id,
> +				      struct OpalHMIEvent *hmi_evt,
> +				      bool *event_generated)
> +{
> +	struct phb *phb;
> +	struct npu *p = NULL;
> +	int i;
> +
> +	uint64_t npu2_fir;
> +	uint64_t npu2_fir_mask;
> +	uint64_t npu2_fir_action0;
> +	uint64_t npu2_fir_action1;
> +	uint64_t npu2_fir_addr;
> +	uint64_t npu2_fir_mask_addr;
> +	uint64_t npu2_fir_action0_addr;
> +	uint64_t npu2_fir_action1_addr;
> +	uint64_t fatal_errors;
> +
> +	/* Only check for NPU errors if the chip has a NPU */
> +	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9)
> +		return;

We don't need this. The loop should check if there is a compatible NPU or not on
this chip.

> +	/* Find the NPU on the chip associated with the HMI. */
> +	for_each_phb(phb) {
> +		/* NOTE: if a chip ever has >1 NPU this will need adjusting */
> +		if (dt_node_is_compatible(phb->dt_node, "ibm,power9-npu-pciex") &&
> +		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
> +			p = phb_to_npu(phb);
> +			break;
> +		}
> +	}
> +
> +	/* If we didn't find a NPU on the chip, it's not our checkstop. */
> +	if (p == NULL)
> +		return;
> +
> +	npu2_fir_addr = NPU2_FIR_REGISTER_0;
> +	npu2_fir_mask_addr = NPU2_FIR_REGISTER_0_MASK;
> +	npu2_fir_action0_addr = NPU2_FIR_REGISTER_0_ACTION0;
> +	npu2_fir_action1_addr = NPU2_FIR_REGISTER_0_ACTION1;
> +
> +	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
> +		/* Read all the registers necessary to find a checkstop condition. */
> +		if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) ||
> +			xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) ||
> +			xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) ||
> +			xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) {
> +			prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i);
> +			continue;
> +		}
> +
> +		fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1;
> +
> +		/* If there's no errors, we don't need to do anything. */
> +		if (!fatal_errors)
> +			continue;
> +
> +		prlog(PR_ERR, "NPU: FIR#%d FIR 0x%016llx mask 0x%016llx\n",
> +			i, npu2_fir, npu2_fir_mask);
> +		prlog(PR_ERR, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n",
> +			npu2_fir_action0, npu2_fir_action1);
> +
> +		/* Can't do a fence yet, we are just logging fir information for now */
> +		npu2_fir_addr += NPU2_FIR_OFFSET;
> +		npu2_fir_mask_addr += NPU2_FIR_OFFSET;
> +		npu2_fir_action0_addr += NPU2_FIR_OFFSET;
> +		npu2_fir_action1_addr += NPU2_FIR_OFFSET;
> +
> +	}

Not sure I love this loop but it works and I don't really have a better
suggestion. Although perhaps it would be a little clearer if you just stored the
FIR "base address" (ie. npu2_fir_addr) and generated the related registers
(mask, action, etc) as offsets from that.

> +	/* Set up the HMI event */
> +	hmi_evt->severity = OpalHMI_SEV_WARNING;
> +	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> +	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> +	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
> +
> +	/* Marking the event as recoverable so that we don't crash */

You hope :-)

It wouldn't suprise me if there were at least some errors detected here that may
result in other system xstops, but I agree that a NPU HMI shouldn't result in
skiboot/firmware bringing down the system.

> +	queue_hmi_event(hmi_evt, 1);
> +	*event_generated = true;
> +}
> +
>  static void find_npu_checkstop_reason(int flat_chip_id,
>  				      struct OpalHMIEvent *hmi_evt,
>  				      bool *event_generated)
> @@ -532,7 +614,7 @@ static void find_npu_checkstop_reason(int flat_chip_id,
>  
>  	/* Only check for NPU errors if the chip has a NPU */
>  	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)

The previous comment is equally valid for NVLink1 too, but I must've missed it
when this went in.

> -		return;
> +		return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, event_generated);

I don't think this should go here. In my opinion it would be better if
find_npu2_checkstop_reason() was called directly from decode_malfunction().

>  	/* Find the NPU on the chip associated with the HMI. */
>  	for_each_phb(phb) {
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index fdaad192..197bda12 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -475,4 +475,22 @@ void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
>  #define NPU2_DD1_MISC_SCOM_IND_SCOM_DATA	0x38f
>  #define NPU2_MISC_SCOM_IND_SCOM_DATA		0x68f
>  
> +#define NPU2_FIR_REGISTER_0			0x0000000005013C00
> +#define NPU2_FIR_REGISTER_0_MASK		0x0000000005013C03
> +#define NPU2_FIR_REGISTER_0_ACTION0		0x0000000005013C06
> +#define NPU2_FIR_REGISTER_0_ACTION1		0x0000000005013C07
> +
> +#define NPU2_FIR_REGISTER_1			0x0000000005013C40
> +#define NPU2_FIR_REGISTER_1_MASK		0x0000000005013C43
> +#define NPU2_FIR_REGISTER_1_ACTION0		0x0000000005013C46
> +#define NPU2_FIR_REGISTER_1_ACTION1		0x0000000005013C47
> +
> +#define NPU2_FIR_REGISTER_2			0x0000000005013C80
> +#define NPU2_FIR_REGISTER_2_MASK		0x0000000005013C83
> +#define NPU2_FIR_REGISTER_2_ACTION0		0x0000000005013C86
> +#define NPU2_FIR_REGISTER_2_ACTION1		0x0000000005013C87
> +
> +#define NPU2_TOTAL_FIR_REGISTERS		3
> +#define NPU2_FIR_OFFSET				0x40
> +
>  #endif /* __NPU2_REGS_H */
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 0bc036ed..cef88f68 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -725,6 +725,104 @@ enum OpalHMI_NestAccelXstopReason {
>  	NX_CHECKSTOP_PBI_ISN_UE			= 0x00002000,
>  };
>  
> +/*
> + * Can't use enums for 64 bit values, use #defines
> + */
> +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_CE		PPC_BIT(0)
> +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_HDR_CE		PPC_BIT(1)
> +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_UE		PPC_BIT(2)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_FLIT_PERR		PPC_BIT(3)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_DATA_PERR		PPC_BIT(4)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_MALFOR		PPC_BIT(5)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_UNSUPPORTED	PPC_BIT(6)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_CONFIG_ERR		PPC_BIT(7)
> +#define NPU2_CHECKSTOP_REG0_NTL_NVL_CRC_ERR		PPC_BIT(8)
> +#define NPU2_CHECKSTOP_REG0_NTL_PRI_ERR			PPC_BIT(9)
> +#define NPU2_CHECKSTOP_REG0_NTL_LOGIC_ERR		PPC_BIT(10)
> +#define NPU2_CHECKSTOP_REG0_NTL_LMD_POISON		PPC_BIT(11)
> +#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_SUE		PPC_BIT(12)
> +#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_CE		PPC_BIT(13)
> +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_RECOV_ERR		PPC_BIT(14)
> +#define NPU2_CHECKSTOP_REG0_CTL_REG_RING_ERR		PPC_BIT(15)
> +#define NPU2_CHECKSTOP_REG0_CTL_MMIO_ST_DATA_UE		PPC_BIT(16)
> +#define NPU2_CHECKSTOP_REG0_CTL_PEF			PPC_BIT(17)
> +#define NPU2_CHECKSTOP_REG0_CTL_NVL_CFG_ERR		PPC_BIT(18)
> +#define NPU2_CHECKSTOP_REG0_CTL_NVL_FATAL_ERR		PPC_BIT(19)
> +#define NPU2_CHECKSTOP_REG0_RESERVED_1			PPC_BIT(20)
> +#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_UE		PPC_BIT(21)
> +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_PERR		PPC_BIT(22)
> +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_FATAL_ERR		PPC_BIT(23)
> +#define NPU2_CHECKSTOP_REG0_CTL_PBUS_CONFIG_ERR		PPC_BIT(24)
> +#define NPU2_CHECKSTOP_REG0_CTL_FWD_PROGRESS_ERR	PPC_BIT(25)
> +#define NPU2_CHECKSTOP_REG0_CTL_LOGIC_ERR		PPC_BIT(26)
> +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_UE		PPC_BIT(29)
> +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_CE		PPC_BIT(30)
> +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_PERR		PPC_BIT(31)
> +#define NPU2_CHECKSTOP_REG0_DAT_CREG_PERR		PPC_BIT(32)
> +#define NPU2_CHECKSTOP_REG0_DAT_RTAG_PERR		PPC_BIT(33)
> +#define NPU2_CHECKSTOP_REG0_DAT_STATE_PERR		PPC_BIT(34)
> +#define NPU2_CHECKSTOP_REG0_DAT_LOGIC_ERR		PPC_BIT(35)
> +#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_SUE		PPC_BIT(36)
> +#define NPU2_CHECKSTOP_REG0_DAT_PBRX_SUE		PPC_BIT(37)
> +#define NPU2_CHECKSTOP_REG0_XTS_INT			PPC_BIT(40)
> +#define NPU2_CHECKSTOP_REG0_XTS_SRAM_CE			PPC_BIT(41)
> +#define NPU2_CHECKSTOP_REG0_XTS_SRAM_UE			PPC_BIT(42)
> +#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_CE		PPC_BIT(43)
> +#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_UE		PPC_BIT(44)
> +#define NPU2_CHECKSTOP_REG0_XTS_PBUS_PROTOCOL		PPC_BIT(45)
> +
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK0_STALL			PPC_BIT(0)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK0_NOSTALL			PPC_BIT(1)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK1_STALL			PPC_BIT(2)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK1_NOSTALL			PPC_BIT(3)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK2_STALL			PPC_BIT(4)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK2_NOSTALL			PPC_BIT(5)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK3_STALL			PPC_BIT(6)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK3_NOSTALL			PPC_BIT(7)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK4_STALL			PPC_BIT(8)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK4_NOSTALL			PPC_BIT(9)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK5_STALL			PPC_BIT(10)
> +#define NPU2_CHECKSTOP_REG1_NDL_BRK5_NOSTALL			PPC_BIT(11)
> +#define NPU2_CHECKSTOP_REG1_MISC_REG_RING_ERR			PPC_BIT(12)
> +#define NPU2_CHECKSTOP_REG1_MISC_INT_RA_PERR			PPC_BIT(13)
> +#define NPU2_CHECKSTOP_REG1_MISC_DA_ADDR_PERR			PPC_BIT(14)
> +#define NPU2_CHECKSTOP_REG1_MISC_CTRL_PERR			PPC_BIT(15)
> +#define NPU2_CHECKSTOP_REG1_MISC_NMMU_ERR			PPC_BIT(16)
> +#define NPU2_CHECKSTOP_REG1_ATS_TVT_ENTRY_INVALID		PPC_BIT(17)
> +#define NPU2_CHECKSTOP_REG1_ATS_TVT_ADDR_RANGE_ERR		PPC_BIT(18)
> +#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_CA_ERR		PPC_BIT(19)
> +#define NPU2_CHECKSTOP_REG1_ATS_TCE_CACHE_MULT_HIT_ERR		PPC_BIT(20)
> +#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_TW_ERR		PPC_BIT(21)
> +#define NPU2_CHECKSTOP_REG1_ATS_TCE_REQ_TO_ERR			PPC_BIT(22)
> +#define NPU2_CHECKSTOP_REG1_ATS_TCD_PERR			PPC_BIT(23)
> +#define NPU2_CHECKSTOP_REG1_ATS_TDR_PERR			PPC_BIT(24)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_UE			PPC_BIT(25)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_CE			PPC_BIT(26)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_UE			PPC_BIT(27)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_CE			PPC_BIT(28)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_UE			PPC_BIT(29)
> +#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_CE			PPC_BIT(30)
> +#define NPU2_CHECKSTOP_REG1_ATS_TVT_PERR			PPC_BIT(31)
> +#define NPU2_CHECKSTOP_REG1_ATS_IODA_ADDR_PERR			PPC_BIT(32)
> +#define NPU2_CHECKSTOP_REG1_ATS_NPU_CTRL_PERR			PPC_BIT(33)
> +#define NPU2_CHECKSTOP_REG1_ATS_NPU_TOR_PERR			PPC_BIT(34)
> +#define NPU2_CHECKSTOP_REG1_ATS_INVAL_IODA_TBL_SEL		PPC_BIT(35)
> +
> +#define NPU2_CHECKSTOP_REG2_XSL_MMIO_INVALIDATE_REQ_WHILE_1_INPROG	PPC_BIT(36)
> +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_0			PPC_BIT(37)
> +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_1			PPC_BIT(38)
> +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_RD_PEE_COMPLETION		PPC_BIT(39)
> +#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_CO_RESP			PPC_BIT(40)
> +#define NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID		PPC_BIT(41)
> +#define NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE				PPC_BIT(42)
> +#define NPU2_CHECKSTOP_REG2_XSL_BLOOM_FILTER_PROTECT_ERR		PPC_BIT(43)
> +#define NPU2_CHECKSTOP_REG2_XSL_CE					PPC_BIT(46)
> +#define NPU2_CHECKSTOP_REG2_XSL_UE					PPC_BIT(47)
> +#define NPU2_CHECKSTOP_REG2_XSL_SLBI_TLBI_BUFF_OVERFLOW			PPC_BIT(48)
> +#define NPU2_CHECKSTOP_REG2_XSL_SBE_CORR_ERR_PB_CHKOUT_RSP_DATA		PPC_BIT(49)
> +#define NPU2_CHECKSTOP_REG2_XSL_UE_PB_CHKOUT_RSP_DATA			PPC_BIT(50)
> +#define NPU2_CHECKSTOP_REG2_XSL_SUE_PB_CHKOUT_RSP_DATA			PPC_BIT(51)
> +

I couldn't see these referenced anywhere, do we use them at all?

- Alistair

>  struct OpalHMIEvent {
>  	uint8_t		version;	/* 0x00 */
>  	uint8_t		severity;	/* 0x01 */
> 




More information about the Skiboot mailing list