[PATCH kernel v4 11/11] powerpc/powernv/npu: Enable NVLink pass through

Alistair Popple alistair at popple.id.au
Fri May 6 11:02:44 AEST 2016


On Thu, 5 May 2016 15:49:18 Alexey Kardashevskiy wrote:
> On 05/04/2016 12:08 AM, Alistair Popple wrote:
> > Hi Alexey,
> >
> > On Fri, 29 Apr 2016 18:55:24 Alexey Kardashevskiy wrote:
> >> IBM POWER8 NVlink systems come with Tesla K40-ish GPUs, each of which
> >> also has a couple of high-speed links (NVLink). The interface to these
> >> links is exposed as an emulated PCI bridge which is included in the
> >> same IOMMU group as the corresponding GPU.
> >>
> >> In the kernel, NPUs get a separate PHB of the PNV_PHB_NPU type and a PE
> >> which behaves pretty much like a standard IODA2 PHB, except that the NPU
> >> PHB has just a single TVE in the hardware, which means it can have either
> >> a 32bit window, a 64bit window or DMA bypass, but never more than one of
> >> these at a time.
> >>
> >> In order to make these links work when the GPU is passed through to
> >> the guest, these bridges need to be passed through as well; otherwise
> >> performance will degrade.
> >>
> >> This implements and exports an API to manage the NPU state with regard
> >> to VFIO; it replicates iommu_table_group_ops.
> >>
> >> This defines a new pnv_pci_ioda2_npu_ops which is assigned to
> >> the IODA2 bridge if there are NPUs for a GPU on the bridge.
> >> The new callbacks call the default IODA2 callbacks plus the new NPU API.
> >> This adds a gpe_table_group_to_npe() helper to find the NPU PE for the
> >> IODA2 table_group; it is not expected to fail as the helper is only
> >> called from pnv_pci_ioda2_npu_ops.
> >>
> >> This does not define an NPU-specific .release_ownership(), so after
> >> VFIO is finished, DMA on the NPU is disabled; this is ok as the NVIDIA
> >> driver sets the DMA mask when probing, which enables 32bit or 64bit DMA
> >> on the NPU.
> >>
> >> This adds a pnv_pci_npu_setup_iommu() helper which adds NPUs to
> >> the GPU group if any are found. The helper looks for the "ibm,gpu"
> >> property in the device tree, which is a phandle of the corresponding
> >> GPU.
> >>
> >> This adds an additional loop over PEs in pnv_ioda_setup_dma() as the main
> >> loop skips NPU PEs as they do not have 32bit DMA segments.
> >>
> >> As pnv_npu_set_window() and pnv_npu_unset_window() are now used by
> >> the new IODA2-NPU IOMMU group, this makes the helpers public and
> >> adds a DMA window number parameter.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
> >> ---
> >> Changes:
> >> v4:
> >> * reused pnv_npu_set_window/pnv_npu_unset_window where possible
> >> * added comments, changed commit log
> >>
> >> v3:
> >> * moved NPU-to-GPU IOMMU grouping later after all PHBs are discovered
> >> * removed hack to make iommu_add_device() work, iommu_group_add_device()
> >> is used instead
> >> * cleanup in gpe_table_group_to_npe_cb()
> >>
> >> v2:
> >> * reimplemented to support NPU + GPU in the same group
> >> * merged "powerpc/powernv/npu: Add NPU devices to IOMMU group" and
> >> "powerpc/powernv/npu: Enable passing through via VFIO" into this patch
> >> ---
> >>  arch/powerpc/platforms/powernv/npu-dma.c  |  64 +++++++++++++++++--
> >>  arch/powerpc/platforms/powernv/pci-ioda.c | 102 ++++++++++++++++++++++++++++++
> >>  arch/powerpc/platforms/powernv/pci.h      |   6 ++
> >>  3 files changed, 166 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> >> index cb2d1da..0459e10 100644
> >> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> >> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> >> @@ -12,6 +12,7 @@
> >>  #include <linux/export.h>
> >>  #include <linux/pci.h>
> >>  #include <linux/memblock.h>
> >> +#include <linux/iommu.h>
> >>
> >>  #include <asm/iommu.h>
> >>  #include <asm/pnv-pci.h>
> >> @@ -154,7 +155,7 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
> >>  	return pe;
> >>  }
> >>
> >> -static long pnv_npu_set_window(struct pnv_ioda_pe *npe,
> >> +long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
> >>  		struct iommu_table *tbl)
> >>  {
> >>  	struct pnv_phb *phb = npe->phb;
> >> @@ -182,13 +183,13 @@ static long pnv_npu_set_window(struct pnv_ioda_pe *npe,
> >>  	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> >>
> >>  	/* Add the table to the list so its TCE cache will get invalidated */
> >> -	pnv_pci_link_table_and_group(phb->hose->node, 0,
> >> +	pnv_pci_link_table_and_group(phb->hose->node, num,
> >>  			tbl, &npe->table_group);
> >>
> >>  	return 0;
> >>  }
> >>
> >> -static long pnv_npu_unset_window(struct pnv_ioda_pe *npe)
> >> +long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
> >>  {
> >>  	struct pnv_phb *phb = npe->phb;
> >>  	int64_t rc;
> >> @@ -205,7 +206,7 @@ static long pnv_npu_unset_window(struct pnv_ioda_pe *npe)
> >>  	}
> >>  	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> >>
> >> -	pnv_pci_unlink_table_and_group(npe->table_group.tables[0],
> >> +	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
> >>  			&npe->table_group);
> >>
> >>  	return 0;
> >> @@ -231,7 +232,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
> >>  	if (!gpe)
> >>  		return;
> >>
> >> -	rc = pnv_npu_set_window(npe, gpe->table_group.tables[0]);
> >> +	rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
> >>
> >>  	/*
> >>  	 * We don't initialise npu_pe->tce32_table as we always use
> >> @@ -255,7 +256,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
> >>  	if (phb->type != PNV_PHB_NPU || !npe->pdev)
> >>  		return -EINVAL;
> >>
> >> -	rc = pnv_npu_unset_window(npe);
> >> +	rc = pnv_npu_unset_window(npe, 0);
> >>  	if (rc != OPAL_SUCCESS)
> >>  		return rc;
> >>
> >> @@ -307,3 +308,54 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
> >>  		}
> >>  	}
> >>  }
> >> +
> >> +/* Switch ownership from platform code to external user (e.g. VFIO) */
> >> +void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
> >> +{
> >> +	struct pnv_phb *phb = npe->phb;
> >> +	int64_t rc;
> >> +
> >> +	/*
> >> +	 * Note: NPU has just a single TVE in the hardware which means that
> >> +	 * while used by the kernel, it can have either 32bit window or
> >> +	 * DMA bypass but never both. So we deconfigure 32bit window only
> >> +	 * if it was enabled at the moment of ownership change.
> >> +	 */
> >> +	if (npe->table_group.tables[0]) {
> >> +		pnv_npu_unset_window(npe, 0);
> >> +		return;
> >> +	}
> >> +
> >> +	/* Disable bypass */
> >> +	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
> >> +			npe->pe_number, npe->pe_number,
> >> +			0 /* bypass base */, 0);
> >> +	if (rc) {
> >> +		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
> >> +		return;
> >> +	}
> >> +	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
> >> +}
> >> +
> >> +struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
> >> +{
> >> +	struct pnv_phb *phb = npe->phb;
> >> +	struct pci_bus *pbus = phb->hose->bus;
> >> +	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
> >> +	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
> >> +
> >> +	if (!gpe || !gpdev)
> >> +		return NULL;
> >> +
> >> +	list_for_each_entry(npdev, &pbus->devices, bus_list) {
> >> +		gptmp = pnv_pci_get_gpu_dev(npdev);
> >> +
> >> +		if (gptmp != gpdev)
> >> +			continue;
> >
> > If I'm not mistaken it looks like you are trying to find all the NPU PEs
> > also attached to the same GPU on the same (firmware-emulated) NPU PHB? I'm
> > just curious - does this work if the GPU has, say, 2 NPU PE#s (i.e. links)
> > on different NPU PHBs?
> >
> >> +		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
> >> +		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
> >> +	}
> >> +
> >> +	return gpe;
> >> +}
> >> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> index 922c74c..feabe50 100644
> >> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> @@ -2273,6 +2273,90 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
> >>  	.take_ownership = pnv_ioda2_take_ownership,
> >>  	.release_ownership = pnv_ioda2_release_ownership,
> >>  };
> >> +
> >> +static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
> >> +{
> >> +	struct pci_controller *hose;
> >> +	struct pnv_phb *phb;
> >> +	struct pnv_ioda_pe **ptmppe = opaque;
> >> +	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
> >> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> >> +
> >> +	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> >> +		return 0;
> >> +
> >> +	hose = pci_bus_to_host(pdev->bus);
> >> +	phb = hose->private_data;
> >> +	if (phb->type != PNV_PHB_NPU)
> >> +		return 0;
> >> +
> >> +	*ptmppe = &phb->ioda.pe_array[pdn->pe_number];
> >> +
> >> +	return 1;
> >> +}
> >> +
> >> +/*
> >> + * This returns PE of associated NPU.
> >> + * This assumes that NPU is in the same IOMMU group with GPU and there is
> >> + * no other PEs.
> >> + */
> >
> > Which answers my above question as this doesn't look like it supports GPUs
> > with multiple NPU PE#s attached. I don't think this is actually a problem
> > though as no hardware I know of will ever do this, even though theoretically
> > it's possible.
> 
> 
> I believe that when such a situation happens in the future, it will be a 
> different PVR, PHB4 (or 5 or whatever) and IODA3 (or 4, or...).

In *theory* it could still happen on PHB3/IODA2.
 
> I could write code on the assumption that there can be more than one NPU PE 
> per GPU PE, but it does not make much sense (to me) to design the hardware 
> this way, and if/when it is designed this way, the details will most likely 
> differ from what I can imagine today.

I completely agree. No point reworking it and adding complexity for a situation
that most likely will never exist. I just wanted to get an understanding of any
restrictions in case something does change in future.
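
For what it's worth, the warning I suggested below could be as simple as the
rough, untested sketch here. It reuses the structures from this patch and is
based on your gpe_table_group_to_npe_cb(); the helper names are ones I made
up purely for illustration, and I'm not asking for this in the series:

static int gpe_count_npes_cb(struct device *dev, void *opaque)
{
	unsigned int *count = opaque;
	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pci_controller *hose;
	struct pnv_phb *phb;

	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
		return 0;

	hose = pci_bus_to_host(pdev->bus);
	phb = hose->private_data;

	/* Count every NPU PE in the group instead of stopping at the first */
	if (phb->type == PNV_PHB_NPU)
		(*count)++;

	return 0;
}

static void gpe_warn_on_multiple_npes(struct iommu_table_group *table_group)
{
	unsigned int count = 0;

	iommu_group_for_each_dev(table_group->group, &count,
			gpe_count_npes_cb);

	/* Only a single NPU PE per GPU PE is handled by these ops today */
	WARN_ON_ONCE(count > 1);
}

But as I said, I don't think it's worth adding even that unless such hardware
ever actually appears.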

> >
> > It might be nice if we could warn if this configuration is detected, but
> > it's not really a big issue.
>  >
> > Everything else looks reasonable as far as I can tell
> > (although again I'm no vfio iommu groups expert) so happy for you to add my
> > reviewed-by:
> >
> > Reviewed-By: Alistair Popple <alistair at popple.id.au>
> 
> 
> Thanks!
> 
> >
> >> +static struct pnv_ioda_pe *gpe_table_group_to_npe(
> >> +		struct iommu_table_group *table_group)
> >> +{
> >> +	struct pnv_ioda_pe *npe = NULL;
> >> +	int ret = iommu_group_for_each_dev(table_group->group, &npe,
> >> +			gpe_table_group_to_npe_cb);
> >> +
> >> +	BUG_ON(!ret || !npe);
> >> +
> >> +	return npe;
> >> +}
> >> +
> >> +static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
> >> +		int num, struct iommu_table *tbl)
> >> +{
> >> +	long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
> >> +
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
> >> +	if (ret)
> >> +		pnv_pci_ioda2_unset_window(table_group, num);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static long pnv_pci_ioda2_npu_unset_window(
> >> +		struct iommu_table_group *table_group,
> >> +		int num)
> >> +{
> >> +	long ret = pnv_pci_ioda2_unset_window(table_group, num);
> >> +
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
> >> +}
> >> +
> >> +static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
> >> +{
> >> +	/*
> >> +	 * Detach NPU first as pnv_ioda2_take_ownership() will destroy
> >> +	 * the iommu_table if 32bit DMA is enabled.
> >> +	 */
> >> +	pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
> >> +	pnv_ioda2_take_ownership(table_group);
> >> +}
> >> +
> >> +static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
> >> +	.get_table_size = pnv_pci_ioda2_get_table_size,
> >> +	.create_table = pnv_pci_ioda2_create_table,
> >> +	.set_window = pnv_pci_ioda2_npu_set_window,
> >> +	.unset_window = pnv_pci_ioda2_npu_unset_window,
> >> +	.take_ownership = pnv_ioda2_npu_take_ownership,
> >> +	.release_ownership = pnv_ioda2_release_ownership,
> >> +};
> >>  #endif
> >>
> >>  static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
> >> @@ -3012,6 +3096,7 @@ static void pnv_pci_ioda_setup_DMA(void)
> >>  {
> >>  	struct pci_controller *hose, *tmp;
> >>  	struct pnv_phb *phb;
> >> +	struct pnv_ioda_pe *pe, *gpe;
> >>
> >>  	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> >>  		pnv_ioda_setup_dma(hose->private_data);
> >> @@ -3020,6 +3105,23 @@ static void pnv_pci_ioda_setup_DMA(void)
> >>  		phb = hose->private_data;
> >>  		phb->initialized = 1;
> >>  	}
> >> +
> >> +	/*
> >> +	 * Now we have all PHBs discovered, time to add NPU devices to
> >> +	 * the corresponding IOMMU groups.
> >> +	 */
> >> +	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> >> +		phb = hose->private_data;
> >> +
> >> +		if (phb->type != PNV_PHB_NPU)
> >> +			continue;
> >> +
> >> +		list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
> >> +			gpe = pnv_pci_npu_setup_iommu(pe);
> >> +			if (gpe)
> >> +				gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
> >> +		}
> >> +	}
> >>  }
> >>
> >>  static void pnv_pci_ioda_create_dbgfs(void)
> >> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> >> index e117bd8..ebc6ee4 100644
> >> --- a/arch/powerpc/platforms/powernv/pci.h
> >> +++ b/arch/powerpc/platforms/powernv/pci.h
> >> @@ -244,5 +244,11 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> >>  /* Nvlink functions */
> >>  extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
> >>  extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
> >> +extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
> >> +extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
> >> +		struct iommu_table *tbl);
> >> +extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
> >> +extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
> >> +extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
> >>
> >>  #endif /* __POWERNV_PCI_H */
> >>
> >
> 
> 
> 