[Skiboot] [PATCH V2 7/7] hmi: Add handling for NPU checkstops

Alistair Popple alistair at popple.id.au
Thu Mar 24 14:07:58 AEDT 2016


On Mon, 21 Mar 2016 12:00:06 Russell Currey wrote:
> If the NPU detects an unrecoverable error, it will send an HMI.  This is
> problematic since unhandled HMIs will checkstop the entire system, which
> is not the intended behaviour of an NPU failure.  Instead, the NPU
> emulated PCI devices should be fenced as part of EEH.
> 
> Add support for handling NPU HMIs.  This works by finding the NPU
> responsible for the HMI, checking its error registers, and sending a
> recoverable HMI event.  The NPU itself cannot actually recover, but the
> system should not be brought down.  Fence mode is set on the NPU, such
> that any further operations on the NPU will trigger EEH, and it will be
> subsequently fenced from the system.
> 
> Signed-off-by: Russell Currey <ruscur at russell.cc>

Reviewed-by: Alistair Popple <alistair at popple.id.au>

> ---
> V2:
> 	- use for_each_phb macro, dealing with non-continuous PHB IDs
> 	- add a comment mentioning a future >1 NPU per chip case
> 	- move the P8NVL check into the function, looks cleaner
> ---
>  core/hmi.c         | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/opal-api.h |  1 +
>  2 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index a4c7869..5d8020a 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -24,6 +24,8 @@
>  #include <pci.h>
>  #include <cpu.h>
>  #include <chip.h>
> +#include <npu-regs.h>
> +#include <npu.h>
>  
>  /*
>   * HMER register layout:
> @@ -439,6 +441,74 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>  	*event_generated = 1;
>  }
>  
> +static void find_npu_checkstop_reason(int flat_chip_id,
> +				      struct OpalHMIEvent *hmi_evt,
> +				      int *event_generated)
> +{
> +	struct phb *phb;
> +	struct npu *p = NULL;
> +
> +	uint64_t npu_fir;
> +	uint64_t npu_fir_mask;
> +	uint64_t npu_fir_action0;
> +	uint64_t npu_fir_action1;
> +	uint64_t fatal_errors;
> +
> +	/* Only check for NPU errors if the chip has a NPU */
> +	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
> +		return;
> +
> +	/* Find the NPU on the chip associated with the HMI. */
> +	for_each_phb(phb) {
> +		/* NOTE: if a chip ever has >1 NPU this will need adjusting */
> +		if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") &&
> +		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
> +			p = phb_to_npu(phb);
> +			break;
> +		}
> +	}
> +
> +	/* If we didn't find a NPU on the chip, it's not our checkstop. */
> +	if (p == NULL)
> +		return;
> +
> +	/* Read all the registers necessary to find a checkstop condition. */
> +	if (xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR, &npu_fir) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
> +		prerror("HMI: Couldn't read NPU registers with XSCOM\n");
> +		return;
> +	}
> +
> +	fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1;
> +
> +	/* If there's no errors, we don't need to do anything. */
> +	if (!fatal_errors)
> +		return;
> +
> +	prlog(PR_DEBUG,
> +	      "NPU: FIR %llx FIR mask %llx FIR ACTION0 %llx FIR ACTION1 %llx\n",
> +	      npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
> +
> +	/* Set the NPU to fenced since it can't recover. */
> +	p->fenced = true;
> +
> +	/* Set up the HMI event */
> +	hmi_evt->severity = OpalHMI_SEV_WARNING;
> +	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> +	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> +	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
> +
> +	/* The HMI is "recoverable" because it shouldn't crash the system */
> +	queue_hmi_event(hmi_evt, 1);
> +	*event_generated = 1;
> +}
> +
>  static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
>  {
>  	int i;
> @@ -456,8 +526,8 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
>  				queue_hmi_event(hmi_evt, recover);
>  				event_generated = 1;
>  			}
> -
>  			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
> +			find_npu_checkstop_reason(i, hmi_evt, &event_generated);
>  		}
>  
>  	if (recover != -1) {
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 369aa93..0b7b0bb 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -577,6 +577,7 @@ enum OpalHMI_XstopType {
>  	CHECKSTOP_TYPE_UNKNOWN	=	0,
>  	CHECKSTOP_TYPE_CORE	=	1,
>  	CHECKSTOP_TYPE_NX	=	2,
> +	CHECKSTOP_TYPE_NPU	=	3
>  };
>  
>  enum OpalHMI_CoreXstopReason {
> 



More information about the Skiboot mailing list