[Skiboot] [PATCH v3 5/7] hw/npu2: Report errors to the OS if an OpenCAPI brick is fenced

Andrew Donnellan andrew.donnellan at au1.ibm.com
Mon Apr 8 17:19:45 AEST 2019


On 6/4/19 1:33 am, Frederic Barrat wrote:
> Now that the NPU may report interrupts due to the link going down
> unexpectedly, report those errors to the OS when queried by the
> 'next_error' PHB callback.
> 
> The hardware doesn't support recovery of the link when it goes down
> unexpectedly. So we report the PHB as dead, so that the OS can log the
> proper message, notify the drivers and take the devices down.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

Adding my:

Reviewed-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>

again.

> ---
> Changelog
> v2, v3: no change
> 
>   hw/npu2-opencapi.c | 55 ++++++++++++++++++++++++++++++++++++++++++----
>   include/npu2.h     |  1 +
>   2 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index 285615a5..9df51b22 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1434,18 +1434,64 @@ static int64_t npu2_opencapi_ioda_reset(struct phb __unused *phb,
>   	return OPAL_SUCCESS;
>   }
>   
> -static int64_t npu2_opencapi_set_pe(struct phb __unused *phb,
> -				    uint64_t __unused pe_num,
> +static int64_t npu2_opencapi_set_pe(struct phb *phb,
> +				    uint64_t pe_num,
>   				    uint64_t __unused bdfn,
>   				    uint8_t __unused bcompare,
>   				    uint8_t __unused dcompare,
>   				    uint8_t __unused fcompare,
>   				    uint8_t __unused action)
>   {
> +	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
>   	/*
>   	 * Ignored on OpenCAPI - we use fixed PE assignments. May need
>   	 * addressing when we support dual-link devices.
> +	 *
> +	 * We nonetheless store the PE reported by the OS so that we
> +	 * can send it back in case of error. If there are several PCI
> +	 * functions on the device, the OS can define many PEs; we
> +	 * only keep one, and the OS will handle it.
>   	 */
> +	dev->linux_pe = pe_num;
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu2_opencapi_freeze_status(struct phb *phb __unused,
> +			   uint64_t pe_number __unused,
> +			   uint8_t *freeze_state,
> +			   uint16_t *pci_error_type,
> +			   uint16_t *severity)
> +{
> +	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
> +	*pci_error_type = OPAL_EEH_NO_ERROR;
> +	if (severity)
> +		*severity = OPAL_EEH_SEV_NO_ERROR;
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
> +				   uint64_t *first_frozen_pe,
> +				   uint16_t *pci_error_type,
> +				   uint16_t *severity)
> +{
> +	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
> +	uint64_t reg;
> +
> +	if (!first_frozen_pe || !pci_error_type || !severity)
> +		return OPAL_PARAMETER;
> +
> +	reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE);
> +	if (reg & PPC_BIT(dev->brick_index)) {
> +		OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index);
> +		*first_frozen_pe = dev->linux_pe;
> +		*pci_error_type = OPAL_EEH_PHB_ERROR;
> +		*severity = OPAL_EEH_SEV_PHB_DEAD;
> +	} else {
> +		*first_frozen_pe = -1;
> +		*pci_error_type = OPAL_EEH_NO_ERROR;
> +		*severity = OPAL_EEH_SEV_NO_ERROR;
> +	}
>   	return OPAL_SUCCESS;
>   }
>   
> @@ -1646,6 +1692,7 @@ static void setup_device(struct npu2_dev *dev)
>   	dev->phb_ocapi.scan_map = 0;
>   
>   	dev->bdfn = 0;
> +	dev->linux_pe = -1;
>   	dev->train_need_fence = false;
>   	dev->train_fenced = false;
>   
> @@ -1765,10 +1812,10 @@ static const struct phb_ops npu2_opencapi_ops = {
>   	.get_msi_64		= NULL,
>   	.set_pe			= npu2_opencapi_set_pe,
>   	.set_peltv		= NULL,
> -	.eeh_freeze_status	= npu2_freeze_status,  /* TODO */
> +	.eeh_freeze_status	= npu2_opencapi_freeze_status,
>   	.eeh_freeze_clear	= NULL,
>   	.eeh_freeze_set		= NULL,
> -	.next_error		= NULL,
> +	.next_error		= npu2_opencapi_eeh_next_error,
>   	.err_inject		= NULL,
>   	.get_diag_data		= NULL,
>   	.get_diag_data2		= NULL,
> diff --git a/include/npu2.h b/include/npu2.h
> index 6c73679f..ef4e7aff 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -157,6 +157,7 @@ struct npu2_dev {
>   
>   	/* OpenCAPI */
>   	struct phb		phb_ocapi;
> +	uint64_t		linux_pe;
>   	bool			train_need_fence;
>   	bool			train_fenced;
>   };
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com  IBM Australia Limited



More information about the Skiboot mailing list