[Skiboot] [PATCH skiboot] npu2: Reset NVLinks when resetting a GPU

srikanth sraithal at linux.vnet.ibm.com
Wed Jun 12 15:57:43 AEST 2019


Tested-by: Srikanth Aithal <sraithal at linux.vnet.ibm.com>

The issue is now fixed; tested with skiboot-v6.3.1-p0a5edb1.

On 5/28/19 6:48 AM, Alexey Kardashevskiy wrote:
>
> On 20/05/2019 14:19, Alexey Kardashevskiy wrote:
>> Resetting a V100 GPU brings its NVLinks down, and if an NPU then tries to
>> use them, an HMI occurs. We were lucky not to observe this: bare metal
>> does not normally reset a GPU, and when GPUs are passed through, they are
>> usually listed before NPUs in the QEMU command line or Libvirt XML, and
>> because of that the NPUs are naturally reset first. However, simply
>> changing the device order triggers HMIs.
>>
>> This defines a bus control filter for a PCI slot with a GPU with NVLinks,
>> so that when the host system issues a secondary bus reset to the slot, it
>> also resets the associated NVLinks.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>
> Reported-by: Srikanth Aithal <sraithal at linux.vnet.ibm.com>
>
>
> Srikanth, can you please confirm (with Tested-by) the fix? Thanks. Also
> we are migrating now to @linux.ibm.com.
>
>
> Stewart, ping?
>
>
>
>> ---
>>   hw/npu2.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 55 insertions(+)
>>
>> diff --git a/hw/npu2.c b/hw/npu2.c
>> index e444bc66cfd3..4aa1231dfc2b 100644
>> --- a/hw/npu2.c
>> +++ b/hw/npu2.c
>> @@ -537,6 +537,48 @@ static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
>>   	return 0;
>>   }
>>   
>> +static int64_t npu2_gpu_brigde_sec_bus_reset(void *dev,
>> +		struct pci_cfg_reg_filter *pcrf __unused,
>> +		uint32_t offset, uint32_t len,
>> +		uint32_t *data, bool write)
>> +{
>> +	struct pci_device *pd = dev;
>> +	struct pci_device *gpu;
>> +	struct phb *npphb;
>> +	struct npu2 *npu;
>> +	struct dt_node *np;
>> +	struct npu2_dev	*ndev;
>> +	int i;
>> +
>> +	assert(write);
>> +
>> +	if ((len != 2) || (offset & 1)) {
>> +		/* Short config writes are not supported */
>> +		PCIERR(pd->phb, pd->bdfn,
>> +		       "Unsupported write to bridge control register\n");
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	gpu = list_top(&pd->children, struct pci_device, link);
>> +	if (gpu && (*data & PCI_CFG_BRCTL_SECONDARY_RESET)) {
>> +		dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex") {
>> +			npphb = pci_get_phb(dt_prop_get_cell(np,
>> +					"ibm,opal-phbid", 1));
>> +			if (!npphb || npphb->phb_type != phb_type_npu_v2)
>> +				continue;
>> +
>> +			npu = phb_to_npu2_nvlink(npphb);
>> +			for (i = 0; i < npu->total_devices; ++i) {
>> +				ndev = &npu->devices[i];
>> +				if (ndev->nvlink.pd == gpu)
>> +					npu2_dev_procedure_reset(ndev);
>> +			}
>> +		}
>> +	}
>> +
>> +	return OPAL_PARTIAL;
>> +}
>> +
>>   static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
>>   {
>>   	struct phb *phb;
>> @@ -558,6 +600,19 @@ static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
>>   			dev->nvlink.phb = phb;
>>   			/* Found the device, set the bit in config space */
>>   			npu2_set_link_flag(dev, NPU2_DEV_PCI_LINKED);
>> +
>> +			/*
>> +			 * We define a custom sec bus reset handler for a slot
>> +			 * with an NVLink-connected GPU to prevent HMIs which
>> +			 * will otherwise happen if we reset GPU before
>> +			 * resetting NVLinks.
>> +			 */
>> +			if (dev->nvlink.pd->parent &&
>> +			    dev->nvlink.pd->parent->slot)
>> +				pci_add_cfg_reg_filter(dev->nvlink.pd->parent,
>> +						PCI_CFG_BRCTL, 2,
>> +						PCI_REG_FLAG_WRITE,
>> +						npu2_gpu_brigde_sec_bus_reset);
>>   			return;
>>   		}
>>   	}
>>



More information about the Skiboot mailing list