[Skiboot] [PATCH v2 5/5] core/pci: Support SRIOV VFs

Gavin Shan gwshan at linux.vnet.ibm.com
Fri Feb 10 12:20:02 AEDT 2017


Currently, skiboot can't see SRIOV VFs, which causes some trouble: the
device initialization logic (phb->ops->device_init()) isn't applied to
VFs, meaning we have to maintain a duplicate of the same mechanism in
the kernel for VFs only. That makes the code harder to maintain and
prone to losing synchronization.

This was motivated by a bug reported by Carol: the VF's Max Payload
Size (MPS) doesn't match the PF's on Mellanox's adapter even though the
kernel tried to make them the same. It's caused by the read-only
PCICAP_EXP_DEVCTL register on VFs. Skiboot is the best place to emulate
these bits and close the gap.

This supports SRIOV VFs. When the PF's SRIOV capability is populated,
the maximal number of VFs (struct pci_device) are instantiated, but
they are not usable yet. In the meantime, a PCI config register filter
is registered against PCIECAP_SRIOV_CTRL_VFE to capture the event of
enabling or disabling VFs. When VFs are enabled, they are initialized,
put into the PF's children list (pd->children), have their PCI
capabilities populated, and get a PCI config register filter registered
against PCICAP_EXP_DEVCTL. The filter's handler caches what is written
to the MPS field and returns the cached value on read, to eliminate the
gap mentioned above.

Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
Reviewed-by: Russell Currey <ruscur at russell.cc>
---
 core/Makefile.inc |   4 +-
 core/pci-iov.c    | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 core/pci.c        |   2 +
 include/pci-cfg.h |  32 +++++++
 include/pci-iov.h |  37 ++++++++
 5 files changed, 330 insertions(+), 2 deletions(-)
 create mode 100644 core/pci-iov.c
 create mode 100644 include/pci-iov.h

diff --git a/core/Makefile.inc b/core/Makefile.inc
index 2167044..ae3c297 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -2,8 +2,8 @@
 
 SUBDIRS += core
 CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
-CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
-CORE_OBJS += timebase.o opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
+CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
+CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
 CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
 CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
diff --git a/core/pci-iov.c b/core/pci-iov.c
new file mode 100644
index 0000000..14c810b
--- /dev/null
+++ b/core/pci-iov.c
@@ -0,0 +1,257 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <pci-iov.h>
+
+/*
+ * Config filter for a VF's DEVCTL register, whose MPS field is read only:
+ * a write caches the MPS bits in pcrf->data[0] and a read overlays the
+ * cached bits onto *data. Other offsets are ignored. Returns OPAL_SUCCESS.
+ */
+static int64_t pci_iov_vf_devctl(void *dev, struct pci_cfg_reg_filter *pcrf,
+				 uint32_t offset, uint32_t len,
+				 uint32_t *data, bool write)
+{
+	struct pci_device *vf = (struct pci_device *)dev;
+	uint32_t pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
+	uint8_t *pcache;
+
+	if (offset != pcrf->start ||
+	    offset != (pos + PCICAP_EXP_DEVCTL))
+		return OPAL_SUCCESS;	/* access doesn't start at DEVCTL */
+
+	pcache = &pcrf->data[0];	/* cached MPS bits live here */
+	if (write) {	/* NOTE(review): confirm the (4 - len) byte shift */
+		*pcache = ((uint8_t)(*data >> (8 * (4 - len)))) &
+			   PCICAP_EXP_DEVCTL_MPS;
+	} else {	/* read: swap in the cached MPS bits */
+		*data &= ~(PCICAP_EXP_DEVCTL_MPS << (8 * (4 - len)));
+		*data |= (((uint32_t)(*pcache & PCICAP_EXP_DEVCTL_MPS))
+			  << (8 * (4 - len)));
+	}
+
+	return OPAL_SUCCESS;
+}
+
+static void pci_iov_vf_quirk(struct phb *phb, struct pci_device *vf)
+{
+	struct pci_cfg_reg_filter *pcrf;
+	uint32_t pos;
+
+	if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false))
+		return;	/* no PCIe capability, so no DEVCTL to filter */
+
+	/*
+	 * On Mellanox MT27500 Family [ConnectX-3], the VF's MPS field in
+	 * DEVCTL is read only, so the MPS of the PF and its VFs usually
+	 * differs. For that vendor/device ID only, hook the VF's DEVCTL
+	 * with pci_iov_vf_devctl() so both look the same.
+	 */
+	if (vf->vdid != 0x100315b3)
+		return;
+
+	pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
+	pcrf = pci_add_cfg_reg_filter(vf, pos + PCICAP_EXP_DEVCTL, 4,
+				      PCI_REG_FLAG_MASK, pci_iov_vf_devctl);
+	if (!pcrf)	/* best effort: a missing filter is only logged */
+		prlog(PR_WARNING, "%s: Missed DEVCTL filter on %04x:%02x:%02x.%01x\n",
+		      __func__, phb->opal_id, (vf->bdfn >> 8),
+		      ((vf->bdfn >> 3) & 0x1f), (vf->bdfn & 0x7));
+}
+
+/*
+ * Re-read VFE from the SRIOV control register. On a transition, refresh (or
+ * zero) the cached VF parameters and return true; otherwise return false.
+ */
+static bool pci_iov_update_parameters(struct pci_iov *iov)
+{
+	struct phb *phb = iov->phb;
+	uint16_t bdfn = iov->pd->bdfn;
+	uint32_t pos = iov->pos;
+	uint16_t val;	/* NOTE(review): config read results are not checked */
+	bool enabled;
+
+	pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_CTRL, &val);
+	enabled = !!(val & PCIECAP_SRIOV_CTRL_VFE);
+	if (iov->enabled == enabled)	/* VFE unchanged */
+		return false;
+
+	if (enabled) {	/* enabled: cache the current register values */
+		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_INITIAL_VF,
+			       &iov->init_VFs);
+		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_NUM_VF,
+			       &iov->num_VFs);
+		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_OFFSET,
+			       &iov->offset);
+		pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_STRIDE,
+			       &iov->stride);
+	} else {	/* disabled: clear the cached values */
+		iov->init_VFs	= 0;
+		iov->num_VFs	= 0;
+		iov->offset	= 0;
+		iov->stride	= 0;
+	}
+
+	iov->enabled = enabled;
+	return true;
+}
+
+/* PF SRIOV control filter handler: attach/detach VFs when VFE toggles. */
+static int64_t pci_iov_change(void *dev __unused,
+			      struct pci_cfg_reg_filter *pcrf,
+			      uint32_t offset __unused, uint32_t len __unused,
+			      uint32_t *data __unused, bool write __unused)
+{
+	struct pci_iov *iov = (struct pci_iov *)pcrf->data;
+	struct phb *phb = iov->phb;
+	struct pci_device *pd = iov->pd;
+	struct pci_device *vf, *tmp;
+	bool waited = false;
+	uint32_t i;
+
+	/* Re-read the SRIOV registers; nothing to do unless VFE changed */
+	if (!pci_iov_update_parameters(iov))
+		return OPAL_SUCCESS;
+
+	/* Remove all VFs that have been attached to the parent */
+	if (!iov->enabled) {
+		list_for_each_safe(&pd->children, vf, tmp, link)
+			list_del(&vf->link);
+		return OPAL_SUCCESS;
+	}
+
+	/*
+	 * Initialize the VFs and attach them to the parent. Only total_VFs
+	 * entries were allocated, so never index past that via NumVFs.
+	 */
+	for (i = 0; i < iov->num_VFs && i < iov->total_VFs; i++) {
+		vf = &iov->VFs[i];
+		vf->bdfn = pd->bdfn + iov->offset + iov->stride * i;
+		list_add_tail(&pd->children, &vf->link);
+
+		/*
+		 * Capabilities found on a previous enable are kept, so the
+		 * probe (and the one-off 100ms wait for the VF's config
+		 * space to become ready) is skipped for populated VFs.
+		 */
+		if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false)) {
+			if (!waited) {
+				waited = true;
+				time_wait_ms(100);
+			}
+			pci_init_capabilities(phb, vf);
+			pci_iov_vf_quirk(phb, vf);
+		}
+
+		/* Call the PHB hook on the VF itself rather than on the PF */
+		if (phb->ops->device_init)
+			phb->ops->device_init(phb, vf, NULL);
+	}
+
+	return OPAL_SUCCESS;
+}
+
+/*
+ * Preset a VF's software state from its PF. This runs with SRIOV disabled,
+ * so the VF's config address isn't final and its config space isn't usable.
+ */
+static void pci_iov_init_VF(struct pci_device *pd, struct pci_device *vf)
+{
+	vf->is_bridge		= false;
+	vf->is_multifunction	= false;
+	vf->dev_type		= PCIE_TYPE_ENDPOINT;
+	vf->scan_map		= -1;
+	vf->vdid		= pd->vdid;	/* IDs and class are the PF's */
+	vf->sub_vdid		= pd->sub_vdid;
+	vf->class		= pd->class;
+	vf->dn			= NULL;
+	vf->slot		= NULL;
+	vf->parent		= pd;	/* back-pointer to the PF */
+	list_head_init(&vf->pcrf);
+	list_head_init(&vf->children);
+}
+
+void pci_init_iov_cap(struct phb *phb, struct pci_device *pd)
+{
+	int64_t pos;
+	struct pci_iov *iov;
+	struct pci_cfg_reg_filter *pcrf;
+	uint32_t i;
+
+	/* Track SRIOV on this device, if it has the extended capability */
+	if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+		return;
+
+	pos = pci_find_ecap(phb, pd->bdfn, PCIECAP_ID_SRIOV, NULL);
+	if (pos <= 0)
+		return;	/* no SRIOV capability */
+
+	/* Allocate the per-PF SRIOV state */
+	iov = zalloc(sizeof(*iov));
+	if (!iov) {
+		prlog(PR_ERR, "%s: Cannot alloc IOV for %04x:%02x:%02x.%01x\n",
+		      __func__, phb->opal_id, (pd->bdfn >> 8),
+		      ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
+		return;
+	}
+
+	/* Allocate one pci_device per possible VF (TotalVFs) */
+	pci_cfg_read16(phb, pd->bdfn, pos + PCIECAP_SRIOV_TOTAL_VF,
+		       &iov->total_VFs);
+	iov->VFs = zalloc(sizeof(*iov->VFs) * iov->total_VFs);
+	if (!iov->VFs) {
+		prlog(PR_ERR, "%s: Cannot alloc %d VFs for %04x:%02x:%02x.%01x\n",
+		      __func__, iov->total_VFs, phb->opal_id,
+		      (pd->bdfn >> 8), ((pd->bdfn >> 3) & 0x1f),
+		      (pd->bdfn & 0x7));
+		free(iov);
+		return;
+	}
+
+	/* Preset every VF; they are attached later by pci_iov_change() */
+	for (i = 0; i < iov->total_VFs; i++)
+		pci_iov_init_VF(pd, &iov->VFs[i]);
+
+	/* Filter SRIOV_CTRL writes to catch VFs being enabled or disabled */
+	pcrf = pci_add_cfg_reg_filter(pd, pos + PCIECAP_SRIOV_CTRL, 2,
+				      PCI_REG_FLAG_WRITE, pci_iov_change);
+	if (!pcrf) {
+		prlog(PR_ERR, "%s: Cannot set filter on %04x:%02x:%02x.%01x\n",
+		      __func__, phb->opal_id, (pd->bdfn >> 8),
+		      ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
+		free(iov->VFs);
+		free(iov);
+		return;
+	}
+
+	/* Hand the IOV state to the filter handler through pcrf->data */
+	pcrf->data = (void *)iov;
+
+	/*
+	 * Fill in the IOV state, pick up the VF parameters if SRIOV is
+	 * already enabled, and register the SRIOV capability on the PF.
+	 */
+	iov->phb = phb;
+	iov->pd = pd;
+	iov->pos = pos;
+	iov->enabled = false;
+	pci_iov_update_parameters(iov);
+	pci_set_cap(pd, PCIECAP_ID_SRIOV, pos, iov, true);
+}
diff --git a/core/pci.c b/core/pci.c
index 142a1a1..9889dbf 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -18,6 +18,7 @@
 #include <cpu.h>
 #include <pci.h>
 #include <pci-cfg.h>
+#include <pci-iov.h>
 #include <pci-slot.h>
 #include <timebase.h>
 #include <device.h>
@@ -204,6 +205,7 @@ void pci_init_capabilities(struct phb *phb, struct pci_device *pd)
 {
 	pci_init_pcie_cap(phb, pd);
 	pci_init_aer_cap(phb, pd);
+	pci_init_iov_cap(phb, pd);
 }
 
 static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *parent,
diff --git a/include/pci-cfg.h b/include/pci-cfg.h
index 27c0f74..530f0a8 100644
--- a/include/pci-cfg.h
+++ b/include/pci-cfg.h
@@ -486,6 +486,38 @@
 #define PCIECAP_AER_TLP_PFX_LOG2	0x40
 #define PCIECAP_AER_TLP_PFX_LOG3	0x44
 
+/* SRIOV capability */
+#define PCIECAP_ID_SRIOV		0x10
+#define PCIECAP_SRIOV_CAP		0x04
+#define   PCIECAP_SRIOV_CAP_VFM		0x01
+#define   PCIECAP_SRIOV_CAP_INTR(x)	((x) >> 21)
+#define PCIECAP_SRIOV_CTRL		0x08
+#define   PCIECAP_SRIOV_CTRL_VFE	0x01
+#define   PCIECAP_SRIOV_CTRL_VFM	0x02
+#define   PCIECAP_SRIOV_CTRL_INTR	0x04
+#define   PCIECAP_SRIOV_CTRL_MSE	0x08
+#define   PCIECAP_SRIOV_CTRL_ARI	0x10
+#define PCIECAP_SRIOV_STATUS		0x0a
+#define   PCIECAP_SRIOV_STATUS_VFM	0x01
+#define PCIECAP_SRIOV_INITIAL_VF	0x0c
+#define PCIECAP_SRIOV_TOTAL_VF		0x0e
+#define PCIECAP_SRIOV_NUM_VF		0x10
+#define PCIECAP_SRIOV_FUNC_LINK		0x12
+#define PCIECAP_SRIOV_VF_OFFSET		0x14
+#define PCIECAP_SRIOV_VF_STRIDE		0x16
+#define PCIECAP_SRIOV_VF_DID		0x1a
+#define PCIECAP_SRIOV_SUP_PGSIZE	0x1c
+#define PCIECAP_SRIOV_SYS_PGSIZE	0x20
+#define PCIECAP_SRIOV_BAR		0x24
+#define    PCIECAP_SRIOV_NUM_BARS	6
+#define PCIECAP_SRIOV_VFM		0x3c
+#define  PCIECAP_SRIOV_VFM_BIR(x)	((x) & 7)
+#define  PCIECAP_SRIOV_VFM_OFFSET(x)	((x) & ~7)
+#define  PCIECAP_SRIOV_VFM_UA		0x0
+#define  PCIECAP_SRIOV_VFM_MI		0x1
+#define  PCIECAP_SRIOV_VFM_MO		0x2
+#define  PCIECAP_SRIOV_VFM_AV		0x3
+
 /* Vendor specific extend capability */
 #define PCIECAP_ID_VNDR			0x0b
 #define PCIECAP_VNDR_HDR		0x04
diff --git a/include/pci-iov.h b/include/pci-iov.h
new file mode 100644
index 0000000..787b2cd
--- /dev/null
+++ b/include/pci-iov.h
@@ -0,0 +1,37 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PCI_IOV_H
+#define __PCI_IOV_H
+
+struct pci_iov {
+	struct phb			*phb;	/* PHB used for config access */
+	struct pci_device		*pd;	/* the PF owning this state */
+	struct pci_device		*VFs;	/* array of total_VFs devices */
+	uint32_t			pos;	/* SRIOV ecap offset on the PF */
+	bool				enabled;	/* cached VFE state */
+	struct pci_cfg_reg_filter	pcrf;	/* NOTE(review): unused in pci-iov.c */
+
+	uint16_t			init_VFs;	/* 0 while disabled */
+	uint16_t			total_VFs;	/* read once at init */
+	uint16_t			num_VFs;	/* 0 while disabled */
+	uint16_t			offset;	/* first VF offset, 0 if disabled */
+	uint16_t			stride;	/* VF stride, 0 if disabled */
+};
+
+extern void pci_init_iov_cap(struct phb *phb, struct pci_device *pd);
+
+#endif
-- 
2.7.4



More information about the Skiboot mailing list