[Skiboot] [RFC 08/12] npu2-opencapi: Improve error reporting to the OS

Andrew Donnellan ajd at linux.ibm.com
Fri Jun 21 17:20:23 AEST 2019


On 19/6/19 10:45 pm, Frederic Barrat wrote:
> When resetting an opencapi link, the brick is temporarily fenced.
> Therefore we can no longer rely on the fencing state of the brick
> to check the health of an opencapi PHB: we could report a spurious
> error if queried for the PHB state while a link is being
> reset.
> 
> Instead, we flag the device as 'broken' when an error interrupt is
> received, just before raising an event to the OS. When the OS
> queries the state of a PHB, we only have to look at the 'broken'
> flag.
> 
> Note that there's no recovery possible on P9 when an error interrupt
> is received unexpectedly, as recovery is not supported by hardware. So
> when a device/link is marked as 'broken', it stays broken. All the OS
> can do is log the error and notify the drivers.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

Reviewed-by: Andrew Donnellan <ajd at linux.ibm.com>

> ---
>   hw/npu2-common.c   |  7 +++++++
>   hw/npu2-opencapi.c | 21 +++++++++++++++++----
>   include/npu2.h     |  4 ++++
>   3 files changed, 28 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index f3f2f45a..a2563efc 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -419,6 +419,13 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
>   			p->chip_id, irq_name);
>   		free(irq_name);
>   		show_all_regs(p, brick);
> +		/*
> +		 * P9 NPU doesn't support recovering a link going down
> +		 * unexpectedly. So we mark the device as broken and
> +		 * report it to the OS, so that the error is logged
> +		 * and the drivers notified.
> +		 */
> +		npu2_opencapi_set_broken(p, brick);
>   		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
>   					OPAL_EVENT_PCI_ERROR);
>   		break;
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index 153f2c6f..c11c945f 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1476,14 +1476,12 @@ static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
>   				   uint16_t *severity)
>   {
>   	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
> -	uint64_t reg;
>   
>   	if (!first_frozen_pe || !pci_error_type || !severity)
>   		return OPAL_PARAMETER;
>   
> -	reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE);
> -	if (reg & PPC_BIT(dev->brick_index)) {
> -		OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index);
> +	if (dev->flags & NPU2_DEV_BROKEN) {
> +		OCAPIDBG(dev, "Reporting device as broken\n");
>   		*first_frozen_pe = dev->linux_pe;
>   		*pci_error_type = OPAL_EEH_PHB_ERROR;
>   		*severity = OPAL_EEH_SEV_PHB_DEAD;
> @@ -1833,6 +1831,21 @@ static const struct phb_ops npu2_opencapi_ops = {
>   	.tce_kill		= NULL,
>   };
>   
> +void npu2_opencapi_set_broken(struct npu2 *npu, int brick)
> +{
> +	struct phb *phb;
> +	struct npu2_dev *dev;
> +
> +	for_each_phb(phb) {
> +		if (phb->phb_type == phb_type_npu_v2_opencapi) {
> +			dev = phb_to_npu2_dev_ocapi(phb);
> +			if (dev->npu == npu &&
> +			    dev->brick_index == brick)
> +				dev->flags |= NPU2_DEV_BROKEN;
> +		}
> +	}
> +}
> +
>   static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
>   				uint64_t addr, uint64_t PE_mask)
>   {
> diff --git a/include/npu2.h b/include/npu2.h
> index 4648464b..b376d0ee 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -131,6 +131,8 @@ struct npu2_dev_nvlink {
>   	const char		*slot_label;
>   };
>   
> +#define NPU2_DEV_BROKEN		0x1
> +
>   struct npu2_dev {
>   	enum npu2_dev_type	type;
>   	uint32_t		link_index;
> @@ -139,6 +141,7 @@ struct npu2_dev {
>   	struct dt_node		*dt_node;
>   	struct npu2_pcie_bar	bars[2];
>   	struct npu2		*npu;
> +	long			flags;
>   
>   	uint32_t		bdfn;
>   
> @@ -256,4 +259,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
>   			   uint16_t *pci_error_type __unused,
>   			   uint16_t *severity __unused);
>   void npu2_dump_scoms(int chip_id);
> +void npu2_opencapi_set_broken(struct npu2 *npu, int brick);
>   #endif /* __NPU2_H */
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
ajd at linux.ibm.com             IBM Australia Limited



More information about the Skiboot mailing list