[Skiboot] [PATCH 5/5] core/pci: Support SRIOV VFs

Russell Currey ruscur at russell.cc
Mon Feb 6 17:29:22 AEDT 2017


On Fri, 2016-11-18 at 16:05 +1100, Gavin Shan wrote:
> Currently, skiboot can't see SRIOV VFs. It introduces some troubles
> as I can see: The device initialization logic (phb->ops->device_init())
> isn't applied to VFs, meaning we have to maintain same and duplicated
> mechanism in kernel for VFs only. It introduces difficulty to code
> maintaining and prone to lose sychronization.
> 
> This was motivated by bug reported by Carol: The VF's Max Payload
> Size (MPS) isn't matched with PF's on Mellanox's adapter even kernel
> tried to make them same. It's caused by readonly PCIECAP_EXP_DEVCTL
> register on VFs. The skiboot would be best place to emulate this bits
> to eliminate the gap as I can see.
> 
> This supports SRIOV VFs. When the PF's SRIOV capability is populated,
> the number of maximal VFs (struct pci_device) are instanciated, but
> usable yet. In the mean while, PCI config register filter is registered

should this be "but not usable yet"?

> against PCIECAP_SRIOV_CTRL_VFE to capture the event of enabling or
> disabling VFs. The VFs are initialized, put into the PF's children
> list (pd->children), populate its PCI capabilities, and register PCI
> config register filter against PCICAP_EXP_DEVCTL. The filter's handler
> caches what is written to MPS field and returns the cached value on
> read, to eliminate the gap mentioned as above.
>
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---

Having knowledge of VFs in skiboot seems like a good idea.

Minor comments below. 

Reviewed-by: Russell Currey <ruscur at russell.cc>

>  core/Makefile.inc |   4 +-
>  core/pci-iov.c    | 252
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  core/pci.c        |   2 +
>  include/pci-cfg.h |  32 +++++++
>  include/pci-iov.h |  37 ++++++++
>  5 files changed, 325 insertions(+), 2 deletions(-)
>  create mode 100644 core/pci-iov.c
>  create mode 100644 include/pci-iov.h
> 
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index 9223af1..5ff84d1 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -2,8 +2,8 @@
>  
>  SUBDIRS += core
>  CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
> -CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
> -CORE_OBJS += timebase.o opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
> +CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
> timebase.o
> +CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
>  CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o
> affinity.o
>  CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
>  CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
> diff --git a/core/pci-iov.c b/core/pci-iov.c
> new file mode 100644
> index 0000000..2d37b15
> --- /dev/null
> +++ b/core/pci-iov.c
> @@ -0,0 +1,252 @@
> +/* Copyright 2013-2016 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <pci.h>
> +#include <pci-cfg.h>
> +#include <pci-slot.h>
> +#include <pci-iov.h>
> +
> +/*
> + * Tackle the VF's MPS in PCIe capability. The field is read only.
> + * This function caches what is written and returns the cached
> + * MPS on read.
> + */
> +static int64_t pci_iov_vf_devctl(void *dev, struct pci_cfg_reg_filter *pcrf,
> +				 uint32_t offset, uint32_t len,
> +				 uint32_t *data, bool write)
> +{
> +	struct pci_device *vf = (struct pci_device *)dev;
> +	uint32_t pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
> +	uint8_t *pcache;
> +
> +	if (offset != pcrf->start ||
> +	    offset != (pos + PCICAP_EXP_DEVCTL))
> +		return OPAL_SUCCESS;

In what situations would this be called with a wrong offset?  Does there need to
be some kind of message or different return here, given this function always
returns OPAL_SUCCESS?

> +
> +	pcache = &pcrf->data[0];
> +	if (write) {
> +		*pcache = ((uint8_t)(*data >> (8 * (4 - len)))) &
> +			   PCICAP_EXP_DEVCTL_MPS;
> +	} else {
> +		*data &= ~(PCICAP_EXP_DEVCTL_MPS << (8 * (4 - len)));
> +		*data |= (((uint32_t)(*pcache & PCICAP_EXP_DEVCTL_MPS))
> +			  << (8 * (4 - len)));
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static void pci_iov_vf_quirk(struct phb *phb, struct pci_device *vf)
> +{
> +	struct pci_cfg_reg_filter *pcrf;
> +	uint32_t pos;
> +
> +	if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false))
> +		return;
> +
> +	/* Mellanox MT27500 Family [ConnectX-3] */

It might be nice to extend the comment to explain why this is broken for this
particular family of devices.

> +	if (vf->vdid != 0x100315b3)
> +		return;
> +
> +	pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
> +	pcrf = pci_add_cfg_reg_filter(vf, pos + PCICAP_EXP_DEVCTL, 4,
> +				      PCI_REG_FLAG_MASK, pci_iov_vf_devctl);
> +	if (!pcrf)
> +		prlog(PR_WARNING, "%s: Missed DEVCTL filter on
> %04x:%02x:%02x.%01x\n",
> +		      __func__, phb->opal_id, (vf->bdfn >> 8),
> +		      ((vf->bdfn >> 3) & 0x1f), (vf->bdfn & 0x7));
> +}
> +
> +/*
> + * Update the SRIOV parameters that change when the number of
> + * VFs is configured.
> + */
> +static bool pci_iov_update_parameters(struct pci_iov *iov)
> +{
> +	struct phb *phb = iov->phb;
> +	uint16_t bdfn = iov->pd->bdfn;
> +	uint32_t pos = iov->pos;
> +	uint16_t val;
> +	bool enabled;
> +
> +	pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_CTRL, &val);
> +	enabled = !!(val & PCIECAP_SRIOV_CTRL_VFE);
> +	if (iov->enabled == enabled)
> +		return false;
> +
> +	if (enabled) {
> +		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_INITIAL_VF,
> +			       &iov->init_VFs);
> +		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_NUM_VF,
> +			       &iov->num_VFs);
> +		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_OFFSET,
> +			       &iov->offset);
> +		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_STRIDE,
> +			       &iov->stride);
> +	} else {
> +		iov->init_VFs	= 0;
> +		iov->num_VFs	= 0;
> +		iov->offset	= 0;
> +		iov->stride	= 0;
> +	}
> +
> +	iov->enabled = enabled;
> +	return true;
> +}
> +
> +static int64_t pci_iov_change(void *dev __unused,
> +			      struct pci_cfg_reg_filter *pcrf,
> +			      uint32_t offset __unused,
> +			      uint32_t len __unused,
> +			      uint32_t *data __unused,
> +			      bool write __unused)
> +{
> +	struct pci_iov *iov = (struct pci_iov *)pcrf->data;
> +	struct phb *phb = iov->phb;
> +	struct pci_device *pd = iov->pd;
> +	struct pci_device *vf, *tmp;
> +	uint32_t i;
> +	bool changed;
> +
> +	/* Update SRIOV variable parameters */
> +	changed = pci_iov_update_parameters(iov);
> +	if (!changed)
> +		return OPAL_SUCCESS;
> +
> +	/* Remove all VFs that have been attached to the parent */
> +	if (!iov->enabled) {
> +		list_for_each_safe(&pd->children, vf, tmp, link)
> +			list_del(&vf->link);
> +		return OPAL_SUCCESS;
> +	}
> +
> +	/* Initialize the VFs and attach them to parent */
> +	for (changed = false, i = 0; i < iov->num_VFs; i++) {
> +		vf = &iov->VFs[i];
> +		vf->bdfn = pd->bdfn + iov->offset + iov->stride * i;
> +		list_add_tail(&pd->children, &vf->link);
> +
> +		/*
> +		 * We don't populate the capabilities again if they have
> +		 * been existing, to save time. Also, we need delay for
> +		 * 100ms before the VF's config space becomes ready.
> +		 */
> +		if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false)) {
> +			if (!changed) {
> +				changed = !changed;
> +				time_wait_ms(100);
> +			}
> +
> +			pci_init_capabilities(phb, vf);
> +			pci_iov_vf_quirk(phb, vf);
> +		}
> +
> +		/* Call PHB hook */
> +		if (phb->ops->device_init)
> +			phb->ops->device_init(phb, pd, NULL);
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +/*
> + * This function is called with disabled SRIOV capability. So the VF's
> + * config address isn't finalized and its config space isn't accessible.
> + */
> +static void pci_iov_init_VF(struct pci_device *pd, struct pci_device *vf)
> +{
> +	vf->is_bridge		= false;
> +	vf->is_multifunction	= false;
> +	vf->dev_type		= PCIE_TYPE_ENDPOINT;
> +	vf->scan_map		= -1;
> +	vf->vdid		= pd->vdid;
> +	vf->sub_vdid		= pd->sub_vdid;
> +	vf->class		= pd->class;
> +	vf->dn			= NULL;
> +	vf->slot		= NULL;
> +	vf->parent		= pd;
> +	list_head_init(&vf->pcrf);
> +	list_head_init(&vf->children);
> +}
> +
> +void pci_init_iov_cap(struct phb *phb, struct pci_device *pd)
> +{
> +	int64_t pos;
> +	struct pci_iov *iov;
> +	struct pci_cfg_reg_filter *pcrf;
> +	uint32_t i;
> +
> +	/* Search for SRIOV capability */
> +	if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
> +		return;
> +
> +	pos = pci_find_ecap(phb, pd->bdfn, PCIECAP_ID_SRIOV, NULL);
> +	if (pos <= 0)
> +		return;
> +
> +	/* Allocate IOV */
> +	iov = zalloc(sizeof(*iov));
> +	if (!iov) {
> +		prlog(PR_ERR, "%s: Cannot alloc IOV for
> %04x:%02x:%02x.%01x\n",
> +		      __func__, phb->opal_id, (pd->bdfn >> 8),
> +		      ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
> +		return;
> +	}
> +
> +	/* Allocate VFs */
> +	pci_cfg_read16(phb, pd->bdfn, pos + PCIECAP_SRIOV_TOTAL_VF,
> +		       &iov->total_VFs);
> +	iov->VFs = zalloc(sizeof(*iov->VFs) * iov->total_VFs);
> +	if (!iov->VFs) {
> +		prlog(PR_ERR, "%s: Cannot alloc %d VFs for
> %04x:%02x:%02x.%01x\n",
> +		      __func__, iov->total_VFs, phb->opal_id,
> +		      (pd->bdfn >> 8), ((pd->bdfn >> 3) & 0x1f),
> +		      (pd->bdfn & 0x7));
> +		free(iov);
> +		return;
> +	}
> +
> +	/* Initialize VFs */
> +	for (i = 0; i < iov->total_VFs; i++)
> +		pci_iov_init_VF(pd, &iov->VFs[i]);
> +
> +	/* Register filter for enabling or disabling SRIOV capability */
> +	pcrf = pci_add_cfg_reg_filter(pd, pos + PCIECAP_SRIOV_CTRL, 2,
> +				      PCI_REG_FLAG_WRITE, pci_iov_change);
> +	if (!pcrf) {
> +		prlog(PR_ERR, "%s: Cannot set filter on
> %04x:%02x:%02x.%01x\n",
> +		      __func__, phb->opal_id, (pd->bdfn >> 8),
> +		      ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
> +		free(iov->VFs);
> +		free(iov);
> +		return;
> +	}
> +
> +	/* Associate filter and IOV capability */
> +	pcrf->data = (void *)iov;
> +
> +	/*
> +	 * Retrieve the number of VFs and other information if applicable.
> +	 * Register the SRIOV capability in the mean while.
> +	 */
> +	iov->phb = phb;
> +	iov->pd = pd;
> +	iov->pos = pos;
> +	iov->enabled = false;
> +	pci_iov_update_parameters(iov);
> +	pci_set_cap(pd, PCIECAP_ID_SRIOV, pos, iov, true);
> +}
> diff --git a/core/pci.c b/core/pci.c
> index b38d5a0..860ff5a 100644
> --- a/core/pci.c
> +++ b/core/pci.c
> @@ -18,6 +18,7 @@
>  #include <cpu.h>
>  #include <pci.h>
>  #include <pci-cfg.h>
> +#include <pci-iov.h>
>  #include <pci-slot.h>
>  #include <timebase.h>
>  #include <device.h>
> @@ -204,6 +205,7 @@ void pci_init_capabilities(struct phb *phb, struct
> pci_device *pd)
>  {
>  	pci_init_pcie_cap(phb, pd);
>  	pci_init_aer_cap(phb, pd);
> +	pci_init_iov_cap(phb, pd);
>  }
>  
>  static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device
> *parent,
> diff --git a/include/pci-cfg.h b/include/pci-cfg.h
> index 27c0f74..530f0a8 100644
> --- a/include/pci-cfg.h
> +++ b/include/pci-cfg.h
> @@ -486,6 +486,38 @@
>  #define PCIECAP_AER_TLP_PFX_LOG2	0x40
>  #define PCIECAP_AER_TLP_PFX_LOG3	0x44
>  
> +/* SRIOV capability */
> +#define PCIECAP_ID_SRIOV		0x10
> +#define PCIECAP_SRIOV_CAP		0x04
> +#define   PCIECAP_SRIOV_CAP_VFM		0x01
> +#define   PCIECAP_SRIOV_CAP_INTR(x)	((x) >> 21)
> +#define PCIECAP_SRIOV_CTRL		0x08
> +#define   PCIECAP_SRIOV_CTRL_VFE	0x01
> +#define   PCIECAP_SRIOV_CTRL_VFM	0x02
> +#define   PCIECAP_SRIOV_CTRL_INTR	0x04
> +#define   PCIECAP_SRIOV_CTRL_MSE	0x08
> +#define   PCIECAP_SRIOV_CTRL_ARI	0x10
> +#define PCIECAP_SRIOV_STATUS		0x0a
> +#define   PCIECAP_SRIOV_STATUS_VFM	0x01
> +#define PCIECAP_SRIOV_INITIAL_VF	0x0c
> +#define PCIECAP_SRIOV_TOTAL_VF		0x0e
> +#define PCIECAP_SRIOV_NUM_VF		0x10
> +#define PCIECAP_SRIOV_FUNC_LINK		0x12
> +#define PCIECAP_SRIOV_VF_OFFSET		0x14
> +#define PCIECAP_SRIOV_VF_STRIDE		0x16
> +#define PCIECAP_SRIOV_VF_DID		0x1a
> +#define PCIECAP_SRIOV_SUP_PGSIZE	0x1c
> +#define PCIECAP_SRIOV_SYS_PGSIZE	0x20
> +#define PCIECAP_SRIOV_BAR		0x24
> +#define    PCIECAP_SRIOV_NUM_BARS	6
> +#define PCIECAP_SRIOV_VFM		0x3c
> +#define  PCIECAP_SRIOV_VFM_BIR(x)	((x) & 7)
> +#define  PCIECAP_SRIOV_VFM_OFFSET(x)	((x) & ~7)
> +#define  PCIECAP_SRIOV_VFM_UA		0x0
> +#define  PCIECAP_SRIOV_VFM_MI		0x1
> +#define  PCIECAP_SRIOV_VFM_MO		0x2
> +#define  PCIECAP_SRIOV_VFM_AV		0x3
> +
>  /* Vendor specific extend capability */
>  #define PCIECAP_ID_VNDR			0x0b
>  #define PCIECAP_VNDR_HDR		0x04
> diff --git a/include/pci-iov.h b/include/pci-iov.h
> new file mode 100644
> index 0000000..787b2cd
> --- /dev/null
> +++ b/include/pci-iov.h
> @@ -0,0 +1,37 @@
> +/* Copyright 2013-2016 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef __PCI_IOV_H
> +#define __PCI_IOV_H
> +
> +struct pci_iov {
> +	struct phb			*phb;
> +	struct pci_device		*pd;
> +	struct pci_device		*VFs;
> +	uint32_t			pos;
> +	bool				enabled;
> +	struct pci_cfg_reg_filter	pcrf;
> +
> +	uint16_t			init_VFs;
> +	uint16_t			total_VFs;
> +	uint16_t			num_VFs;
> +	uint16_t			offset;
> +	uint16_t			stride;
> +};
> +
> +extern void pci_init_iov_cap(struct phb *phb, struct pci_device *pd);
> +
> +#endif



More information about the Skiboot mailing list