[Skiboot] [PATCH v2 5/5] core/pci: Support SRIOV VFs
Gavin Shan
gwshan at linux.vnet.ibm.com
Fri Feb 10 12:20:02 AEDT 2017
Currently, skiboot can't see SRIOV VFs, which causes some trouble:
the device initialization logic (phb->ops->device_init()) isn't applied
to VFs, so the same mechanism has to be duplicated in the kernel for
VFs only. That makes the code harder to maintain and prone to falling
out of synchronization.
This was motivated by a bug reported by Carol: the VF's Max Payload
Size (MPS) doesn't match the PF's on a Mellanox adapter even though the
kernel tries to make them the same. It's caused by the read-only
PCICAP_EXP_DEVCTL register on VFs. Skiboot is the best place to emulate
these bits and close the gap.
This supports SRIOV VFs. When the PF's SRIOV capability is populated,
the maximal number of VFs (struct pci_device) is instantiated, but they
are not usable yet. Meanwhile, a PCI config register filter is
registered against PCIECAP_SRIOV_CTRL_VFE to capture the event of
enabling or disabling VFs. When VFs are enabled, they are initialized
and put into the PF's children list (pd->children), their PCI
capabilities are populated, and a PCI config register filter is
registered against PCICAP_EXP_DEVCTL. The filter's handler caches what
is written to the MPS field and returns the cached value on read, to
eliminate the gap mentioned above.
Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
Reviewed-by: Russell Currey <ruscur at russell.cc>
---
core/Makefile.inc | 4 +-
core/pci-iov.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
core/pci.c | 2 +
include/pci-cfg.h | 32 +++++++
include/pci-iov.h | 37 ++++++++
5 files changed, 330 insertions(+), 2 deletions(-)
create mode 100644 core/pci-iov.c
create mode 100644 include/pci-iov.h
diff --git a/core/Makefile.inc b/core/Makefile.inc
index 2167044..ae3c297 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -2,8 +2,8 @@
SUBDIRS += core
CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
-CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
-CORE_OBJS += timebase.o opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
+CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
+CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
diff --git a/core/pci-iov.c b/core/pci-iov.c
new file mode 100644
index 0000000..14c810b
--- /dev/null
+++ b/core/pci-iov.c
@@ -0,0 +1,257 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <pci-slot.h>
+#include <pci-iov.h>
+
+/*
+ * Filter for a VF's read-only MPS field in PCIe DEVCTL. A write caches the MPS
+ * bits taken from *data in pcrf->data[0]; a read overlays the cached bits onto
+ * *data. Other offsets are ignored. Always returns OPAL_SUCCESS.
+ */
+static int64_t pci_iov_vf_devctl(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len,
+ uint32_t *data, bool write)
+{
+ struct pci_device *vf = (struct pci_device *)dev;
+ uint32_t pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
+ uint8_t *pcache;
+
+ if (offset != pcrf->start ||
+ offset != (pos + PCICAP_EXP_DEVCTL))
+ return OPAL_SUCCESS; /* not an access at the DEVCTL offset */
+
+ pcache = &pcrf->data[0]; /* first byte of the filter's data holds the MPS */
+ if (write) { /* the shift 8 * (4 - len) below is 0 for a 4-byte access */
+ *pcache = ((uint8_t)(*data >> (8 * (4 - len)))) &
+ PCICAP_EXP_DEVCTL_MPS;
+ } else { /* read: clear the MPS bits in *data, then OR in the cached ones */
+ *data &= ~(PCICAP_EXP_DEVCTL_MPS << (8 * (4 - len)));
+ *data |= (((uint32_t)(*pcache & PCICAP_EXP_DEVCTL_MPS))
+ << (8 * (4 - len)));
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static void pci_iov_vf_quirk(struct phb *phb, struct pci_device *vf)
+{
+ struct pci_cfg_reg_filter *pcrf;
+ uint32_t pos;
+
+ if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false)) /* no PCIe capability */
+ return;
+
+ /*
+ * Quirk for Mellanox MT27500 Family [ConnectX-3] VFs only: their MPS
+ * field in DEVCTL is read-only, so PF and VF usually report different
+ * values. Filter the VF's DEVCTL through pci_iov_vf_devctl() so they
+ * look the same; failing to add the filter is only logged.
+ */
+ if (vf->vdid != 0x100315b3)
+ return;
+
+ pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
+ pcrf = pci_add_cfg_reg_filter(vf, pos + PCICAP_EXP_DEVCTL, 4,
+ PCI_REG_FLAG_MASK, pci_iov_vf_devctl);
+ if (!pcrf) /* phb is only used to identify the device in this message */
+ prlog(PR_WARNING, "%s: Missed DEVCTL filter on %04x:%02x:%02x.%01x\n",
+ __func__, phb->opal_id, (vf->bdfn >> 8),
+ ((vf->bdfn >> 3) & 0x1f), (vf->bdfn & 0x7));
+}
+
+/*
+ * Re-read the PF's SRIOV control register and refresh the cached VF parameters
+ * when the VF Enable bit changed. Returns true on a change, false otherwise.
+ */
+static bool pci_iov_update_parameters(struct pci_iov *iov)
+{
+ struct phb *phb = iov->phb;
+ uint16_t bdfn = iov->pd->bdfn;
+ uint32_t pos = iov->pos;
+ uint16_t val;
+ bool enabled;
+
+ pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_CTRL, &val);
+ enabled = !!(val & PCIECAP_SRIOV_CTRL_VFE);
+ if (iov->enabled == enabled)
+ return false; /* VF Enable unchanged: keep the cached values */
+
+ if (enabled) { /* VFs just got enabled: cache the current parameters */
+ pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_INITIAL_VF,
+ &iov->init_VFs);
+ pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_NUM_VF,
+ &iov->num_VFs);
+ pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_OFFSET,
+ &iov->offset);
+ pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_STRIDE,
+ &iov->stride);
+ } else { /* VFs just got disabled: clear the cached parameters */
+ iov->init_VFs = 0;
+ iov->num_VFs = 0;
+ iov->offset = 0;
+ iov->stride = 0;
+ }
+
+ iov->enabled = enabled;
+ return true;
+}
+
+/*
+ * Write filter on the PF's SRIOV control register. If VF Enable changed:
+ * on disable detach all VFs; on enable give each VF its bdfn, attach it to
+ * the PF, probe its capabilities once and call the PHB device_init hook on
+ * it. Only pcrf (which carries the pci_iov) is used. Returns OPAL_SUCCESS.
+ */
+static int64_t pci_iov_change(void *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset __unused, uint32_t len __unused,
+ uint32_t *data __unused, bool write __unused)
+{
+ struct pci_iov *iov = (struct pci_iov *)pcrf->data;
+ struct phb *phb = iov->phb;
+ struct pci_device *pd = iov->pd, *vf, *tmp;
+ uint32_t i, num;
+ bool waited = false;
+
+ /* Nothing to do unless the VF Enable state actually changed */
+ if (!pci_iov_update_parameters(iov))
+ return OPAL_SUCCESS;
+
+ /* Remove all VFs that have been attached to the parent */
+ if (!iov->enabled) {
+ list_for_each_safe(&pd->children, vf, tmp, link)
+ list_del(&vf->link);
+ return OPAL_SUCCESS;
+ }
+
+ /* Never walk past the total_VFs entries allocated in iov->VFs */
+ num = iov->num_VFs < iov->total_VFs ? iov->num_VFs : iov->total_VFs;
+ for (i = 0; i < num; i++) {
+ vf = &iov->VFs[i];
+ vf->bdfn = pd->bdfn + iov->offset + iov->stride * i;
+ list_add_tail(&pd->children, &vf->link);
+
+ /* Keep capabilities probed on an earlier enable; otherwise wait
+ * 100ms (once) for the VFs' config space to become ready.
+ */
+ if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false)) {
+ if (!waited) {
+ waited = true;
+ time_wait_ms(100);
+ }
+ pci_init_capabilities(phb, vf);
+ pci_iov_vf_quirk(phb, vf);
+ }
+
+ /* Run the PHB hook on the VF itself, not on the parent PF */
+ if (phb->ops->device_init)
+ phb->ops->device_init(phb, vf, NULL);
+ }
+
+ return OPAL_SUCCESS;
+}
+
+/*
+ * Preset a VF's pci_device as a non-bridge endpoint that inherits the PF's IDs
+ * and class. Called with SRIOV disabled, so VF config space is not accessible.
+ */
+static void pci_iov_init_VF(struct pci_device *pd, struct pci_device *vf)
+{
+ vf->is_bridge = false;
+ vf->is_multifunction = false;
+ vf->dev_type = PCIE_TYPE_ENDPOINT;
+ vf->scan_map = -1;
+ vf->vdid = pd->vdid; /* IDs and class are copied from the PF, not read */
+ vf->sub_vdid = pd->sub_vdid;
+ vf->class = pd->class;
+ vf->dn = NULL;
+ vf->slot = NULL;
+ vf->parent = pd; /* the PF is recorded as the VF's parent device */
+ list_head_init(&vf->pcrf); /* start with empty filter and children lists */
+ list_head_init(&vf->children);
+}
+
+void pci_init_iov_cap(struct phb *phb, struct pci_device *pd)
+{
+ int64_t pos;
+ struct pci_iov *iov;
+ struct pci_cfg_reg_filter *pcrf;
+ uint32_t i;
+
+ /* Only a PCIe device that exposes the SRIOV extended capability is set up */
+ if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+ return;
+
+ pos = pci_find_ecap(phb, pd->bdfn, PCIECAP_ID_SRIOV, NULL);
+ if (pos <= 0)
+ return;
+
+ /* Allocate the zero-filled per-PF IOV state */
+ iov = zalloc(sizeof(*iov));
+ if (!iov) {
+ prlog(PR_ERR, "%s: Cannot alloc IOV for %04x:%02x:%02x.%01x\n",
+ __func__, phb->opal_id, (pd->bdfn >> 8),
+ ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
+ return;
+ }
+
+ /* Allocate one pci_device per VF counted by the TOTAL_VF register */
+ pci_cfg_read16(phb, pd->bdfn, pos + PCIECAP_SRIOV_TOTAL_VF,
+ &iov->total_VFs);
+ iov->VFs = zalloc(sizeof(*iov->VFs) * iov->total_VFs);
+ if (!iov->VFs) {
+ prlog(PR_ERR, "%s: Cannot alloc %d VFs for %04x:%02x:%02x.%01x\n",
+ __func__, iov->total_VFs, phb->opal_id,
+ (pd->bdfn >> 8), ((pd->bdfn >> 3) & 0x1f),
+ (pd->bdfn & 0x7));
+ free(iov);
+ return;
+ }
+
+ /* Preset every VF from the PF; none is attached to the PF here */
+ for (i = 0; i < iov->total_VFs; i++)
+ pci_iov_init_VF(pd, &iov->VFs[i]);
+
+ /* Install pci_iov_change() as filter on the 2-byte SRIOV control register */
+ pcrf = pci_add_cfg_reg_filter(pd, pos + PCIECAP_SRIOV_CTRL, 2,
+ PCI_REG_FLAG_WRITE, pci_iov_change);
+ if (!pcrf) {
+ prlog(PR_ERR, "%s: Cannot set filter on %04x:%02x:%02x.%01x\n",
+ __func__, phb->opal_id, (pd->bdfn >> 8),
+ ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
+ free(iov->VFs);
+ free(iov);
+ return;
+ }
+
+ /* pci_iov_change() recovers the IOV from the filter's data pointer */
+ pcrf->data = (void *)iov;
+
+ /*
+ * Bind the IOV to its PHB and PF, cache the current SRIOV parameters
+ * and register the capability on the PF, passing the IOV along.
+ */
+ iov->phb = phb;
+ iov->pd = pd;
+ iov->pos = pos;
+ iov->enabled = false;
+ pci_iov_update_parameters(iov);
+ pci_set_cap(pd, PCIECAP_ID_SRIOV, pos, iov, true);
+}
diff --git a/core/pci.c b/core/pci.c
index 142a1a1..9889dbf 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -18,6 +18,7 @@
#include <cpu.h>
#include <pci.h>
#include <pci-cfg.h>
+#include <pci-iov.h>
#include <pci-slot.h>
#include <timebase.h>
#include <device.h>
@@ -204,6 +205,7 @@ void pci_init_capabilities(struct phb *phb, struct pci_device *pd)
{
pci_init_pcie_cap(phb, pd);
pci_init_aer_cap(phb, pd);
+ pci_init_iov_cap(phb, pd);
}
static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *parent,
diff --git a/include/pci-cfg.h b/include/pci-cfg.h
index 27c0f74..530f0a8 100644
--- a/include/pci-cfg.h
+++ b/include/pci-cfg.h
@@ -486,6 +486,38 @@
#define PCIECAP_AER_TLP_PFX_LOG2 0x40
#define PCIECAP_AER_TLP_PFX_LOG3 0x44
+/* SRIOV capability */
+#define PCIECAP_ID_SRIOV 0x10
+#define PCIECAP_SRIOV_CAP 0x04
+#define PCIECAP_SRIOV_CAP_VFM 0x01
+#define PCIECAP_SRIOV_CAP_INTR(x) ((x) >> 21)
+#define PCIECAP_SRIOV_CTRL 0x08
+#define PCIECAP_SRIOV_CTRL_VFE 0x01
+#define PCIECAP_SRIOV_CTRL_VFM 0x02
+#define PCIECAP_SRIOV_CTRL_INTR 0x04
+#define PCIECAP_SRIOV_CTRL_MSE 0x08
+#define PCIECAP_SRIOV_CTRL_ARI 0x10
+#define PCIECAP_SRIOV_STATUS 0x0a
+#define PCIECAP_SRIOV_STATUS_VFM 0x01
+#define PCIECAP_SRIOV_INITIAL_VF 0x0c
+#define PCIECAP_SRIOV_TOTAL_VF 0x0e
+#define PCIECAP_SRIOV_NUM_VF 0x10
+#define PCIECAP_SRIOV_FUNC_LINK 0x12
+#define PCIECAP_SRIOV_VF_OFFSET 0x14
+#define PCIECAP_SRIOV_VF_STRIDE 0x16
+#define PCIECAP_SRIOV_VF_DID 0x1a
+#define PCIECAP_SRIOV_SUP_PGSIZE 0x1c
+#define PCIECAP_SRIOV_SYS_PGSIZE 0x20
+#define PCIECAP_SRIOV_BAR 0x24
+#define PCIECAP_SRIOV_NUM_BARS 6
+#define PCIECAP_SRIOV_VFM 0x3c
+#define PCIECAP_SRIOV_VFM_BIR(x) ((x) & 7)
+#define PCIECAP_SRIOV_VFM_OFFSET(x) ((x) & ~7)
+#define PCIECAP_SRIOV_VFM_UA 0x0
+#define PCIECAP_SRIOV_VFM_MI 0x1
+#define PCIECAP_SRIOV_VFM_MO 0x2
+#define PCIECAP_SRIOV_VFM_AV 0x3
+
/* Vendor specific extend capability */
#define PCIECAP_ID_VNDR 0x0b
#define PCIECAP_VNDR_HDR 0x04
diff --git a/include/pci-iov.h b/include/pci-iov.h
new file mode 100644
index 0000000..787b2cd
--- /dev/null
+++ b/include/pci-iov.h
@@ -0,0 +1,37 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PCI_IOV_H
+#define __PCI_IOV_H
+
+struct pci_iov { /* per-PF SRIOV state built by pci_init_iov_cap() */
+ struct phb *phb; /* PHB given to pci_init_iov_cap() */
+ struct pci_device *pd; /* PF that owns the SRIOV capability */
+ struct pci_device *VFs; /* array of total_VFs VF devices */
+ uint32_t pos; /* config offset of the SRIOV capability */
+ bool enabled; /* last observed PCIECAP_SRIOV_CTRL_VFE state */
+ struct pci_cfg_reg_filter pcrf; /* NOTE(review): looks unused in this patch - confirm */
+
+ uint16_t init_VFs; /* cached PCIECAP_SRIOV_INITIAL_VF, 0 while disabled */
+ uint16_t total_VFs; /* PCIECAP_SRIOV_TOTAL_VF, read once at init */
+ uint16_t num_VFs; /* cached PCIECAP_SRIOV_NUM_VF, 0 while disabled */
+ uint16_t offset; /* cached PCIECAP_SRIOV_VF_OFFSET, 0 while disabled */
+ uint16_t stride; /* cached PCIECAP_SRIOV_VF_STRIDE, 0 while disabled */
+};
+
+extern void pci_init_iov_cap(struct phb *phb, struct pci_device *pd);
+
+#endif
--
2.7.4
More information about the Skiboot
mailing list