[Skiboot] [PATCH] npu2-opencapi: Don't send commands to NPU when link is down

Andrew Donnellan andrew.donnellan at au1.ibm.com
Tue Jul 17 11:49:55 AEST 2018


On 16/07/18 19:13, Frederic Barrat wrote:
> Even if the link is down, the PCI scan framework always tries to access
> the root device ("scan upstream only"). For opencapi, there's no root
> device, so we may try to issue a config operation when the link is
> down. The operation fails, but it raises a FIR bit and can trigger an
> HMI.
> 
> To fix it, we now keep track of the link state and avoid sending any
> configuration space operations to the NPU if we know the link is not
> ready.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
> ---
>   hw/npu2-opencapi.c | 10 +++++++++-
>   include/npu2.h     |  1 +
>   2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index f82e6562..474ac0fb 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1107,6 +1107,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
>   		return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
>   
>   	case OCAPI_SLOT_LINK_TRAINED:
> +		dev->link_down = false;
>   		otl_enabletx(chip_id, dev->npu->xscom_base, dev);
>   		pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
>   		return OPAL_SUCCESS;
> @@ -1137,7 +1138,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   	case OCAPI_SLOT_NORMAL:
>   	case OCAPI_SLOT_FRESET_START:
>   		OCAPIDBG(dev, "FRESET starts\n");
> -
> +		dev->link_down = true;
>   		if (slot->ops.get_presence_state)
>   			slot->ops.get_presence_state(slot, &presence);
>   		if (!presence) {
> @@ -1261,6 +1262,9 @@ static int64_t npu2_opencapi_pcicfg_read(struct phb *phb, uint32_t bdfn,
>   	if (rc)
>   		return rc;
>   
> +	if (dev->link_down)
> +		return OPAL_HARDWARE;
> +
>   	genid_base = dev->bars[1].npu2_bar.base +
>   		(index_to_block(dev->index) == NPU2_BLOCK_OTL1 ? 256 : 0);
>   
> @@ -1319,6 +1323,9 @@ static int64_t npu2_opencapi_pcicfg_write(struct phb *phb, uint32_t bdfn,
>   	if (rc)
>   		return rc;
>   
> +	if (dev->link_down)
> +		return OPAL_HARDWARE;
> +
>   	genid_base = dev->bars[1].npu2_bar.base +
>   		(index_to_block(dev->index) == NPU2_BLOCK_OTL1 ? 256 : 0);
>   
> @@ -1637,6 +1644,7 @@ static void npu2_opencapi_setup_device(struct dt_node *dn_link, struct npu2 *n,
>   	dev->bdfn = 0;
>   	dev->train_need_fence = false;
>   	dev->train_fenced = false;
> +	dev->link_down = true;
>   	n->total_devices++;
>   
>   	/* Find I2C port for handling device reset */
> diff --git a/include/npu2.h b/include/npu2.h
> index 4c2e20e0..c151f0e3 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -138,6 +138,7 @@ struct npu2_dev {
>   	uint64_t		i2c_port_id_ocapi;
>   	bool			train_need_fence;
>   	bool			train_fenced;
> +	bool			link_down;

If this is only being used for OpenCAPI, it should get a comment
saying so.

>   };
>   
>   struct npu2 {
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com  IBM Australia Limited



More information about the Skiboot mailing list