[Skiboot] [PATCH 7/7] hmi: Add handling for NPU checkstops

Fri Mar 18 11:29:16 AEDT 2016

Russell,

A couple of minor nit-picks below, probably worth fixing if doing a V2.

- Alistair

On Tue, 15 Mar 2016 18:33:57 Russell Currey wrote:
> If the NPU detects an unrecoverable error, it will send a HMI.  This is
> problematic since unhandled HMIs will checkstop the entire system, which
> is not the intended behaviour of a NPU failure.  Instead, the NPU
> emulated PCI devices should be fenced as part of EEH.
> 
> Add support for handling NPU HMIs.  This works by finding the NPU
> responsible for the HMI, checking its error registers, and sending a
> recoverable HMI event.  The NPU itself cannot actually recover, but the
> system should not be brought down.  Fence mode is set on the NPU, such
> that any further operations on the NPU will trigger EEH, and it will be
> subsequently fenced from the system.
> 
> Signed-off-by: Russell Currey <ruscur at russell.cc>
> ---
>  core/hmi.c         | 78 
+++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/opal-api.h |  1 +
>  2 files changed, 78 insertions(+), 1 deletion(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index faf99ca..2c1ac95 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -24,6 +24,8 @@
>  #include <pci.h>
>  #include <cpu.h>
>  #include <chip.h>
> +#include <npu-regs.h>
> +#include <npu.h>
>  
>  /*
>   * HMER register layout:
> @@ -433,6 +435,77 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>  	*event_generated = 1;
>  }
>  
> +static void find_npu_checkstop_reason(int flat_chip_id,
> +				      struct OpalHMIEvent *hmi_evt,
> +				      int *event_generated)
> +{
> +	struct phb *phb;
> +	struct npu *p = NULL;
> +
> +	int i;
> +
> +	uint64_t npu_fir;
> +	uint64_t npu_fir_mask;
> +	uint64_t npu_fir_action0;
> +	uint64_t npu_fir_action1;
> +	uint64_t fatal_errors;
> +
> +	/* Find the NPU on the chip associated with the HMI. */
> +	for (i = 0; i < MAX_PHB_ID; i++) {
> +		phb = pci_get_phb(i);
> +
> +		/* If we've gone through all PHBs, we're done. */
> +		if (phb == NULL)
> +			break;

This assumes there are no holes in the PHB ID's which might not be true 
depending on how the PHBs were registered (ie. if the opal_id was passed to 
pci_register_phb instead of OPAL_DYNAMIC_PHB_ID) or if we ever implement 
pci_unregister_phb.

It might better to implement for_each_phb() or similar in pci.h that can walk 
all the PHBs.

> +		if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-
pciex") &&
> +		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
> +			p = phb_to_npu(phb);
> +			break;

This assumes each chip only has a single NPU, which is true today but if 
you're doing a v2 it might be worth a comment to remind us to update this if 
it ever changes...

> +		}
> +	}
> +
> +	/* If we didn't find a NPU on the chip, it's not our checkstop. */
> +	if (p == NULL)
> +		return;
> +
> +	/* Read all the registers necessary to find a checkstop condition. */
> +	if (xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR, &npu_fir) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
> +	    xscom_read(flat_chip_id,
> +		       p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
> +		prerror("HMI: Couldn't read NPU registers with XSCOM\n");
> +		return;
> +	}
> +
> +	fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & 
npu_fir_action1;
> +
> +	/* If there's no errors, we don't need to do anything. */
> +	if (!fatal_errors)
> +		return;
> +
> +	prlog(PR_DEBUG,
> +	      "NPU: FIR %llx FIR mask %llx FIR ACTION0 %llx FIR ACTION1 
%llx\n",
> +	      npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
> +
> +	/* Set the NPU to fenced since it can't recover. */
> +	p->fenced = true;

I assume we don't need a lock here because we never clear the flag?

> +
> +	/* Set up the HMI event */
> +	hmi_evt->severity = OpalHMI_SEV_WARNING;
> +	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> +	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> +	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
> +
> +	/* The HMI is "recoverable" because it shouldn't crash the system */
> +	queue_hmi_event(hmi_evt, 1);
> +	*event_generated = 1;
> +}
> +
>  static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
>  {
>  	int i;
> @@ -450,8 +523,11 @@ static void decode_malfunction(struct OpalHMIEvent 
*hmi_evt)
>  				queue_hmi_event(hmi_evt, recover);
>  				event_generated = 1;
>  			}
> -
>  			find_nx_checkstop_reason(i, hmi_evt, 
&event_generated);
> +			/* Only check for NPU errors if we have a NPU. */
> +			if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P8NVL)
> +				find_npu_checkstop_reason(i, hmi_evt,
> +							  &event_generated);
>  		}
>  
>  	if (recover != -1) {
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 369aa93..0b7b0bb 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -577,6 +577,7 @@ enum OpalHMI_XstopType {
>  	CHECKSTOP_TYPE_UNKNOWN	=	0,
>  	CHECKSTOP_TYPE_CORE	=	1,
>  	CHECKSTOP_TYPE_NX	=	2,
> +	CHECKSTOP_TYPE_NPU	=	3
>  };
>  
>  enum OpalHMI_CoreXstopReason {
>