[Skiboot] [PATCH] opal/hmi: Report NPU2 checkstop reason

Andrew Donnellan ajd at linux.ibm.com
Thu May 30 11:51:16 AEST 2019


On 23/5/19 10:21 pm, Frederic Barrat wrote:
> The NPU2 is currently not passing any information to Linux to explain
> the cause of an HMI. NPU2 has three Fault Isolation Registers and over
> 30 of those FIR bits are configured to raise an HMI by default. We
> won't be able to fit all possible state in the 32-bit xstop_reason
> field of the HMI event, but we can still try to encode up to 4 HMI
> reasons.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

Reviewed-by: Andrew Donnellan <ajd at linux.ibm.com>

> ---
>   core/hmi.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 44 insertions(+)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index d97f3fc0..3b2860f8 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -576,6 +576,46 @@ static bool phb_is_npu2(struct dt_node *dn)
>   		dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex"));
>   }
>   
> +static void add_npu2_xstop_reason(uint32_t *xstop_reason, uint8_t reason)
> +{
> +	int i, reason_count;
> +	uint8_t *ptr;
> +
> +	reason_count = sizeof(*xstop_reason) / sizeof(reason);
> +	ptr = (uint8_t *) xstop_reason;
> +	for (i = 0; i < reason_count; i++) {
> +		if (*ptr == 0) {
> +			*ptr = reason;
> +			break;
> +		}
> +		ptr++;
> +	}
> +}
> +
> +static void encode_npu2_xstop_reason(uint32_t *xstop_reason,
> +				uint64_t fir, int fir_number)
> +{
> +	int bit;
> +	uint8_t reason;
> +
> +	/*
> +	 * There are three 64-bit FIRs but the xstop reason field of
> +	 * the hmi event is only 32-bit. Encode which FIR bit is set as:
> +	 * - 2 bits for the FIR number
> +	 * - 6 bits for the bit number (0 -> 63)
> +	 *
> +	 * So we could even encode up to 4 reasons for the HMI, if
> +	 * that can ever happen
> +	 */
> +	while (fir) {
> +		bit = ilog2(fir);
> +		reason = fir_number << 6;
> +		reason |= (63 - bit); // IBM numbering
> +		add_npu2_xstop_reason(xstop_reason, reason);
> +		fir ^= 1ULL << bit;
> +	}
> +}
> +
>   static void find_npu2_checkstop_reason(int flat_chip_id,
>   				      struct OpalHMIEvent *hmi_evt,
>   				      uint64_t *out_flags)
> @@ -592,6 +632,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   	uint64_t npu2_fir_action0_addr;
>   	uint64_t npu2_fir_action1_addr;
>   	uint64_t fatal_errors;
> +	uint32_t xstop_reason = 0;
>   	int total_errors = 0;
>   	const char *loc;
>   
> @@ -635,6 +676,8 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   			prlog(PR_ERR, "NPU: [Loc: %s] P:%d ACTION0 0x%016llx, ACTION1 0x%016llx\n",
>   					loc, flat_chip_id, npu2_fir_action0, npu2_fir_action1);
>   			total_errors++;
> +
> +			encode_npu2_xstop_reason(&xstop_reason, fatal_errors, i);
>   		}
>   
>   		/* Can't do a fence yet, we are just logging fir information for now */
> @@ -667,6 +710,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   	hmi_evt->severity = OpalHMI_SEV_WARNING;
>   	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
>   	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> +	hmi_evt->u.xstop_error.xstop_reason = xstop_reason;
>   	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
>   
>   	/* Marking the event as recoverable so that we don't crash */
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
ajd at linux.ibm.com             IBM Australia Limited



More information about the Skiboot mailing list