[Skiboot] [PATCH V2 4/4] nvlink: Add primitive EEH support for NPU devices

Alistair Popple alistair at popple.id.au
Wed Jan 20 15:40:59 AEDT 2016


Acked-by: Alistair Popple <alistair at popple.id.au>

On Mon, 18 Jan 2016 16:59:42 Russell Currey wrote:
> Implements Extended Error Handling callbacks for NVLink devices.
> 
> At present, this supports fence mode emulation, and some easily detectable
> freezes.  There is a lot of work still to be done here, but this enables
> EEH to work as expected in some specific scenarios.
> 
> Signed-off-by: Russell Currey <ruscur at russell.cc>
> ---
> V2: Iterate up to NPU_NUM_OF_PES instead of total devices
> ---
>  hw/npu.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 42 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/npu.c b/hw/npu.c
> index 23facaf..a3898b1 100644
> --- a/hw/npu.c
> +++ b/hw/npu.c
> @@ -766,7 +766,6 @@ static void npu_err_interrupt(void *data, uint32_t isn)
>  		prerror("Invalid NPU error interrupt received\n");
>  		break;
>  	case 6 ... 7:
> -		NPUERR(p, "Error handling not implemented\n");
>  		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
>  					OPAL_EVENT_PCI_ERROR);
>  	}
> @@ -992,6 +991,13 @@ static int64_t npu_power_state(struct phb *phb __unused)
>  	return OPAL_SHPC_POWER_ON;
>  }
>  
> +static int64_t npu_hreset(struct phb *phb __unused)
> +{
> +	prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
> +
> +	return OPAL_SUCCESS;
> +}
> +
>  static int64_t npu_freset(struct phb *phb __unused)
>  {
>  	/* FIXME: PHB fundamental reset, which need to be
> @@ -1021,6 +1027,39 @@ static int64_t npu_freeze_status(struct phb *phb,
>  	return OPAL_SUCCESS;
>  }
>  
> +static int64_t npu_eeh_next_error(struct phb *phb,
> +				  uint64_t *first_frozen_pe,
> +				  uint16_t *pci_error_type,
> +				  uint16_t *severity)
> +{
> +	struct npu *p = phb_to_npu(phb);
> +	int i;
> +	uint64_t result = 0;
> +	*first_frozen_pe = -1;
> +	*pci_error_type = OPAL_EEH_NO_ERROR;
> +	*severity = OPAL_EEH_SEV_NO_ERROR;
> +
> +	if (p->fenced) {
> +		*pci_error_type = OPAL_EEH_PHB_ERROR;
> +		*severity = OPAL_EEH_SEV_PHB_FENCED;
> +		return OPAL_SUCCESS;
> +	}
> +
> +	npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
> +	for (i = 0; i < NPU_NUM_OF_PES; i++) {
> +		result = in_be64(p->at_regs + NPU_IODA_DATA0);
> +		if (result > 0) {
> +			*first_frozen_pe = i;
> +			*pci_error_type = OPAL_EEH_PE_ERROR;
> +			*severity = OPAL_EEH_SEV_PE_ER;
> +			break;
> +		}
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +
>  /* Sets the NPU to trigger an error when a DMA occurs */
>  static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
>  			      uint32_t type, uint32_t func __unused,
> @@ -1093,14 +1132,14 @@ static const struct phb_ops npu_ops = {
>  	.power_state		= npu_power_state,
>  	.slot_power_off		= NULL,
>  	.slot_power_on		= NULL,
> -	.hot_reset		= NULL,
> +	.hot_reset		= npu_hreset,
>  	.fundamental_reset	= npu_freset,
>  	.complete_reset		= NULL,
>  	.poll			= NULL,
>  	.eeh_freeze_status	= npu_freeze_status,
>  	.eeh_freeze_clear	= NULL,
>  	.eeh_freeze_set		= NULL,
> -	.next_error		= NULL,
> +	.next_error		= npu_eeh_next_error,
>  	.err_inject		= npu_err_inject,
>  	.get_diag_data		= NULL,
>  	.get_diag_data2		= NULL,
> 



More information about the Skiboot mailing list