[Skiboot] [PATCH v3] nvlink: Associate and allocate NPUs using slots

Alistair Popple alistair at popple.id.au
Tue Jul 12 15:24:16 AEST 2016


This patch looks suitably terrific. We end up treating NPU group IDs as
device numbers for the emulated PCIe devices' BDFNs, but that should work
fine, as the BDFN allocation has been updated to guarantee they match.
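
For reference, a rough sketch of the encoding in question. This is my own
illustration, not skiboot code: the low byte of a BDFN is dddddfff (device
number in bits 7..3, function in bits 2..0), so placing the group ID in the
device-number field gives each link in a group its own function under a
single device.

    /* Standalone illustration (assumed names, not skiboot code): how a
     * group ID maps into the BDFN device-number field, mirroring
     * ST_LOC_NPU_GROUP from the patch. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t npu_group_bdfn(uint32_t group, uint32_t fn)
    {
            /* same shift as ST_LOC_NPU_GROUP(group), plus the function */
            return (group << 3) | (fn & 0x7);
    }

    int main(void)
    {
            /* Garrison: two links per GPU -> functions 0 and 1 */
            printf("group 0 links: %02x %02x\n",
                   (unsigned)npu_group_bdfn(0, 0),
                   (unsigned)npu_group_bdfn(0, 1));
            printf("group 1 links: %02x %02x\n",
                   (unsigned)npu_group_bdfn(1, 0),
                   (unsigned)npu_group_bdfn(1, 1));
            return 0;
    }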

I haven't actually tested it with CUDA and GPUs, but other than that it looks
good.

Reviewed-By: Alistair Popple <alistair at popple.id.au>

On Tue, 5 Jul 2016 15:05:35 Russell Currey wrote:
> Allocating BDFNs to NPU devices and associating NPU devices with PCI
> devices of GPUs both rely on comparing PBCQ handles.  This will fail if a
> system has multiple sets of GPUs behind a single PHB.
> 
> Rework this to instead use slot locations.  The following changes are
> introduced:
> 
>  - Groups of NPU links that connect to the same GPU are presented in the
>    slot table entries as st_npu_slot, using ST_LOC_NPU_GROUP
> 
>  - NPU links are created with an ibm,npu-group-id property replacing the
>    ibm,pbcq property; the group ID is used in BDFN allocation and GPU
>    association
> 
>  - Slot comparison is handled slightly differently for NPU devices: the
>    function portion of the BDFN is ignored, since the device number
>    identifies the physical GPU the link is connected to
> 
>  - BDFN allocation for NPU devices is now derived from the groups in the
>    slot table.  For Garrison, the same BDFNs are generated as before.
> 
>  - Association with GPU PCI devices is performed by comparing the slot
>    label.  This means that for future machines with NPUs, slot labels
>    are compulsory for NVLink functionality to work.
> 
> Signed-off-by: Russell Currey <ruscur at russell.cc>
> ---
> This patch has too much in it, but I can't find a way to separate it
> without breaking functionality.  Let me know if you see a way to do so.
> 
> This supersedes the series starting with "astbmc: Add NPU slot type and
> use it in Garrison".  It's all a bit of a mess...
> ---
>  hw/npu.c                    | 59 +++++++++++++++++++--------------------------
>  include/npu.h               |  3 +++
>  platforms/astbmc/astbmc.h   |  8 ++++++
>  platforms/astbmc/garrison.c | 59 +++++++++++++--------------------------------
>  platforms/astbmc/slots.c    | 10 +++++++-
>  5 files changed, 62 insertions(+), 77 deletions(-)
> 
> diff --git a/hw/npu.c b/hw/npu.c
> index f535f8a..2ab6ac2 100644
> --- a/hw/npu.c
> +++ b/hw/npu.c
> @@ -30,6 +30,7 @@
>  #include <npu-regs.h>
>  #include <npu.h>
>  #include <xscom.h>
> +#include <string.h>
>  
>  /*
>   * Terminology:
> @@ -503,25 +504,27 @@ static int __npu_dev_bind_pci_dev(struct phb *phb __unused,
>  {
>  	struct npu_dev *dev = data;
>  	struct dt_node *pci_dt_node;
> -	uint32_t npu_npcq_phandle;
> +	char *pcislot;
>  
>  	/* Ignore non-nvidia PCI devices */
>  	if ((pd->vdid & 0xffff) != 0x10de)
>  		return 0;
>  
> -	/* Find the PCI devices pbcq */
> -	for (pci_dt_node = pd->dn->parent;
> -	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,pbcq");
> +	/* Find the PCI device's slot location */
> +	for (pci_dt_node = pd->dn;
> +	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label");
>  	     pci_dt_node = pci_dt_node->parent);
>  
>  	if (!pci_dt_node)
>  		return 0;
>  
> -	npu_npcq_phandle = dt_prop_get_u32(dev->dt_node, "ibm,npu-pbcq");
> +	pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label");
>  
> -	if (dt_prop_get_u32(pci_dt_node, "ibm,pbcq") == npu_npcq_phandle &&
> -	    (pd->vdid & 0xffff) == 0x10de)
> -			return 1;
> +	prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n",
> +	      pcislot, dev->slot_label);
> +
> +	if (streq(pcislot, dev->slot_label))
> +		return 1;
>  
>  	return 0;
>  }
> @@ -604,6 +607,9 @@ static int npu_dn_fixup(struct phb *phb,
>  	if (dev->phb || dev->pd)
>  		return 0;
>  
> +	/* NPU devices require a slot location to associate with GPUs */
> +	dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label");
> +
>  	/* Bind the emulated PCI device with the real one, which can't
>  	 * be done until the PCI devices are populated. Once the real
>  	 * PCI device is identified, we also need fix the device-tree
> @@ -1619,34 +1625,19 @@ static void npu_dev_create_cfg(struct npu_dev *dev)
>  		NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_INT_LINE, 4, 0x00000200);
>  }
>  
> -static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t pbcq)
> +static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group)
>  {
>  	int i;
> -	int dev = -1;
> -	int bdfn = -1;
> +	int bdfn = (group << 3);
>  
> -	/* Find the highest function number alloacted to emulated PCI
> -	 * devices associated with this GPU. */
> -	for(i = 0; i < p->total_devices; i++) {
> -		int dev_bdfn = p->devices[i].bdfn;
> -		dev = MAX(dev, dev_bdfn & 0xf8);
> -
> -		if (dt_prop_get_u32(p->devices[i].dt_node,
> -				    "ibm,npu-pbcq") == pbcq)
> -			bdfn = MAX(bdfn, dev_bdfn);
> +	/* Devices are created in order, so scanning the whole list
> +	 * yields the next free function number in this group. */
> +	for (i = 0; i < p->total_devices; i++) {
> +		if (p->devices[i].bdfn == bdfn)
> +			bdfn++;
>  	}
>  
> -	if (bdfn >= 0)
> -		/* Device has already been allocated for this GPU so
> -		 * assign the emulated PCI device the next
> -		 * function. */
> -		return bdfn + 1;
> -	else if (dev >= 0)
> -		/* Otherwise allocate a new device and allocate
> -		 * function 0. */
> -		return dev + (1 << 3);
> -	else
> -		return 0;
> +	return bdfn;
>  }
>  
>  static void npu_create_devices(struct dt_node *dn, struct npu *p)
> @@ -1674,7 +1665,7 @@ static void npu_create_devices(struct dt_node *dn, struct npu *p)
>  	p->phb.scan_map = 0;
>  	dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
>  		struct npu_dev_bar *bar;
> -		uint32_t pbcq;
> +		uint32_t group_id;
>  		uint64_t val;
>  		uint32_t j;
>  
> @@ -1689,8 +1680,8 @@ static void npu_create_devices(struct dt_node *dn, struct npu *p)
>  		/* We don't support MMIO PHY access yet */
>  		dev->pl_base = NULL;
>  
> -		pbcq = dt_prop_get_u32(link, "ibm,npu-pbcq");
> -		dev->bdfn = npu_allocate_bdfn(p, pbcq);
> +		group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
> +		dev->bdfn = npu_allocate_bdfn(p, group_id);
>  
>  		/* This must be done after calling
>  		 * npu_allocate_bdfn() */
> diff --git a/include/npu.h b/include/npu.h
> index 778c985..800515a 100644
> --- a/include/npu.h
> +++ b/include/npu.h
> @@ -148,6 +148,9 @@ struct npu_dev {
>  	uint32_t		procedure_status;
>  
>  	uint8_t			pe_num;
> +
> +	/* Used to associate the NPU device with GPU PCI devices */
> +	const char		*slot_label;
>  };
>  
>  /* NPU PHB descriptor */
> diff --git a/platforms/astbmc/astbmc.h b/platforms/astbmc/astbmc.h
> index 23c31c7..322282e 100644
> --- a/platforms/astbmc/astbmc.h
> +++ b/platforms/astbmc/astbmc.h
> @@ -20,6 +20,13 @@
>  
>  #define ST_LOC_PHB(chip_id, phb_idx)    ((chip_id) << 16 | (phb_idx))
>  #define ST_LOC_DEVFN(dev, fn)	        ((dev) << 3 | (fn))
> +/*
> + * NPU groups are used to allocate device numbers.  There is a 1 to 1
> + * correlation between a NPU group and a physical GPU.  Links within a group
> + * are allocated as functions within a device, so groups must be numbered
> + * sequentially starting at 0.
> + */
> +#define ST_LOC_NPU_GROUP(group_id)	((group_id) << 3)
>  
>  struct slot_table_entry {
>  	enum slot_table_etype {
> @@ -27,6 +34,7 @@ struct slot_table_entry {
>  		st_phb,
>  		st_pluggable_slot,
>  		st_builtin_dev,
> +		st_npu_slot
>  	} etype;
>  	uint32_t location;
>  	const char *name;
> diff --git a/platforms/astbmc/garrison.c b/platforms/astbmc/garrison.c
> index 3ff84a3..f400a51 100644
> --- a/platforms/astbmc/garrison.c
> +++ b/platforms/astbmc/garrison.c
> @@ -63,23 +63,13 @@ static const struct slot_table_entry garrison_phb0_3_slot[] = {
>  
>  static const struct slot_table_entry garrison_npu0_slots[] = {
>  	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(0,0),
> -		.name = "GPU2",
> -	},
> -	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(0,1),
> +		.etype = st_npu_slot,
> +		.location = ST_LOC_NPU_GROUP(0),
>  		.name = "GPU2",
>  	},
>  	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(1,0),
> -		.name = "GPU1",
> -	},
> -	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(1,1),
> +		.etype = st_npu_slot,
> +		.location = ST_LOC_NPU_GROUP(1),
>  		.name = "GPU1",
>  	},
>  	{ .etype = st_end },
> @@ -152,23 +142,13 @@ static const struct slot_table_entry garrison_phb1_3_slot[] = {
>  
>  static const struct slot_table_entry garrison_npu1_slots[] = {
>  	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(0,0),
> -		.name = "GPU4",
> -	},
> -	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(0,1),
> +		.etype = st_npu_slot,
> +		.location = ST_LOC_NPU_GROUP(0),
>  		.name = "GPU4",
>  	},
>  	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(1,0),
> -		.name = "GPU3",
> -	},
> -	{
> -		.etype = st_pluggable_slot,
> -		.location = ST_LOC_DEVFN(1,1),
> +		.etype = st_npu_slot,
> +		.location = ST_LOC_NPU_GROUP(1),
>  		.name = "GPU3",
>  	},
>  	{ .etype = st_end },
> @@ -233,7 +213,7 @@ static const struct slot_table_entry garrison_phb_table[] = {
>  #define NPU_INDIRECT0	0x8000000008010c3f
>  #define NPU_INDIRECT1	0x8000000008010c7f
>  
> -static void create_link(struct dt_node *npu, struct dt_node *pbcq, int index)
> +static void create_link(struct dt_node *npu, int group, int index)
>  {
>  	struct dt_node *link;
>  	uint32_t lane_mask;
> @@ -255,12 +235,12 @@ static void create_link(struct dt_node *npu, struct dt_node *pbcq, int index)
>  	}
>  	dt_add_property_u64s(link, "ibm,npu-phy", phy);
>  	dt_add_property_cells(link, "ibm,npu-lane-mask", lane_mask);
> -	dt_add_property_cells(link, "ibm,npu-pbcq", pbcq->phandle);
> +	dt_add_property_cells(link, "ibm,npu-group-id", group);
>  }
>  
>  static void dt_create_npu(void)
>  {
> -        struct dt_node *xscom, *npu, *pbcq;
> +        struct dt_node *xscom, *npu;
>          char namebuf[32];
>  
>  	dt_for_each_compatible(dt_root, xscom, "ibm,xscom") {
> @@ -275,17 +255,12 @@ static void dt_create_npu(void)
>  		dt_add_property_cells(npu, "ibm,npu-index", 0);
>  		dt_add_property_cells(npu, "ibm,npu-links", 4);
>  
> -		/* On Garrison we have 2 links per GPU device. The
> -		 * first 2 links go to the GPU connected via
> -		 * pbcq at 2012c00 the second two via pbcq at 2012800. */
> -		pbcq = dt_find_by_name(xscom, "pbcq at 2012c00");
> -		assert(pbcq);
> -		create_link(npu, pbcq, 0);
> -		create_link(npu, pbcq, 1);
> -		pbcq = dt_find_by_name(xscom, "pbcq at 2012800");
> -		assert(pbcq);
> -		create_link(npu, pbcq, 4);
> -		create_link(npu, pbcq, 5);
> +		/* On Garrison we have 2 links per GPU device.  These are
> +		 * grouped together as per the slot tables above. */
> +		create_link(npu, 0, 0);
> +		create_link(npu, 0, 1);
> +		create_link(npu, 1, 4);
> +		create_link(npu, 1, 5);
>  	}
>  }
>  
> diff --git a/platforms/astbmc/slots.c b/platforms/astbmc/slots.c
> index 36547e1..678a3cc 100644
> --- a/platforms/astbmc/slots.c
> +++ b/platforms/astbmc/slots.c
> @@ -54,6 +54,7 @@ static const struct slot_table_entry *match_slot_dev_entry(struct phb *phb,
>  							   struct pci_device *pd)
>  {
>  	const struct slot_table_entry *parent, *ent;
> +	uint32_t bdfn;
>  
>  	/* Find a parent recursively */
>  	if (pd->parent)
> @@ -70,7 +71,14 @@ static const struct slot_table_entry *match_slot_dev_entry(struct phb *phb,
>  			prerror("SLOT: Bad PHB entry type in table !\n");
>  			continue;
>  		}
> -		if (ent->location == (pd->bdfn & 0xff))
> +
> +		/* NPU slots match on device, not function */
> +		if (ent->etype == st_npu_slot)
> +			bdfn = pd->bdfn & 0xf8;
> +		else
> +			bdfn = pd->bdfn & 0xff;
> +
> +		if (ent->location == bdfn)
>  			return ent;
>  	}
>  	return NULL;
> 
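
One more illustration of the device-vs-function matching rule from slots.c,
for anyone skimming: NPU slot entries compare only the device number, so
every function (link) of a group resolves to the same slot entry, and the
slot-label string comparison in __npu_dev_bind_pci_dev then ties those links
to the GPU carrying the same label. The sketch below is my own standalone
mock-up with made-up values, not skiboot code:

    /* Mock of the match_slot_dev_entry comparison above: NPU entries
     * mask off the function bits, regular slots match the full devfn. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    enum etype { st_pluggable_slot, st_npu_slot };

    static bool slot_matches(enum etype etype, uint32_t location,
                             uint32_t bdfn)
    {
            uint32_t masked = (etype == st_npu_slot) ? (bdfn & 0xf8)
                                                     : (bdfn & 0xff);
            return location == masked;
    }

    int main(void)
    {
            /* group 0 (e.g. GPU2): links 0.0 and 0.1 both match */
            assert(slot_matches(st_npu_slot, 0 << 3, 0x00));
            assert(slot_matches(st_npu_slot, 0 << 3, 0x01));
            /* a pluggable slot still needs the exact function */
            assert(!slot_matches(st_pluggable_slot, 0x08, 0x09));
            return 0;
    }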


