[Skiboot] [PATCH 5/5] core/pci: Support SRIOV VFs
Russell Currey
ruscur at russell.cc
Mon Feb 6 17:29:22 AEDT 2017
On Fri, 2016-11-18 at 16:05 +1100, Gavin Shan wrote:
> Currently, skiboot can't see SRIOV VFs. It introduces some troubles
> as I can see: The device initialization logic (phb->ops->device_init())
> isn't applied to VFs, meaning we have to maintain same and duplicated
> mechanism in kernel for VFs only. It introduces difficulty to code
> maintaining and prone to lose sychronization.
>
> This was motivated by bug reported by Carol: The VF's Max Payload
> Size (MPS) isn't matched with PF's on Mellanox's adapter even kernel
> tried to make them same. It's caused by readonly PCIECAP_EXP_DEVCTL
> register on VFs. The skiboot would be best place to emulate this bits
> to eliminate the gap as I can see.
>
> This supports SRIOV VFs. When the PF's SRIOV capability is populated,
> the number of maximal VFs (struct pci_device) are instanciated, but
> usable yet. In the mean while, PCI config register filter is registered
should this be "but not usable yet"?
> against PCIECAP_SRIOV_CTRL_VFE to capture the event of enabling or
> disabling VFs. The VFs are initialized, put into the PF's children
> list (pd->children), populate its PCI capabilities, and register PCI
> config register filter against PCICAP_EXP_DEVCTL. The filter's handler
> caches what is written to MPS field and returns the cached value on
> read, to eliminate the gap mentioned as above.
>
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---
Having knowledge of VFs in skiboot seems like a good idea.
Minor comments below.
Reviewed-by: Russell Currey <ruscur at russell.cc>
> core/Makefile.inc | 4 +-
> core/pci-iov.c | 252
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> core/pci.c | 2 +
> include/pci-cfg.h | 32 +++++++
> include/pci-iov.h | 37 ++++++++
> 5 files changed, 325 insertions(+), 2 deletions(-)
> create mode 100644 core/pci-iov.c
> create mode 100644 include/pci-iov.h
>
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index 9223af1..5ff84d1 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -2,8 +2,8 @@
>
> SUBDIRS += core
> CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
> -CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
> -CORE_OBJS += timebase.o opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
> +CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
> timebase.o
> +CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
> CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o
> affinity.o
> CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
> CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
> diff --git a/core/pci-iov.c b/core/pci-iov.c
> new file mode 100644
> index 0000000..2d37b15
> --- /dev/null
> +++ b/core/pci-iov.c
> @@ -0,0 +1,252 @@
> +/* Copyright 2013-2016 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <pci.h>
> +#include <pci-cfg.h>
> +#include <pci-slot.h>
> +#include <pci-iov.h>
> +
> +/*
> + * Tackle the VF's MPS in PCIe capability. The field is read only.
> + * This function caches what is written and returns the cached
> + * MPS on read.
> + */
> +static int64_t pci_iov_vf_devctl(void *dev, struct pci_cfg_reg_filter *pcrf,
> + uint32_t offset, uint32_t len,
> + uint32_t *data, bool write)
> +{
> + struct pci_device *vf = (struct pci_device *)dev;
> + uint32_t pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
> + uint8_t *pcache;
> +
> + if (offset != pcrf->start ||
> + offset != (pos + PCICAP_EXP_DEVCTL))
> + return OPAL_SUCCESS;
In what situations would this be called with a wrong offset? Does there need to
be some kind of message or different return here, given this function always
returns OPAL_SUCCESS?
> +
> + pcache = &pcrf->data[0];
> + if (write) {
> + *pcache = ((uint8_t)(*data >> (8 * (4 - len)))) &
> + PCICAP_EXP_DEVCTL_MPS;
> + } else {
> + *data &= ~(PCICAP_EXP_DEVCTL_MPS << (8 * (4 - len)));
> + *data |= (((uint32_t)(*pcache & PCICAP_EXP_DEVCTL_MPS))
> + << (8 * (4 - len)));
> + }
> +
> + return OPAL_SUCCESS;
> +}
> +
> +static void pci_iov_vf_quirk(struct phb *phb, struct pci_device *vf)
> +{
> + struct pci_cfg_reg_filter *pcrf;
> + uint32_t pos;
> +
> + if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false))
> + return;
> +
> + /* Mellanox MT27500 Family [ConnectX-3] */
It might be nice to extend the comment to explain why this is broken for this
particular family of devices.
> + if (vf->vdid != 0x100315b3)
> + return;
> +
> + pos = pci_cap(vf, PCI_CFG_CAP_ID_EXP, false);
> + pcrf = pci_add_cfg_reg_filter(vf, pos + PCICAP_EXP_DEVCTL, 4,
> + PCI_REG_FLAG_MASK, pci_iov_vf_devctl);
> + if (!pcrf)
> + prlog(PR_WARNING, "%s: Missed DEVCTL filter on
> %04x:%02x:%02x.%01x\n",
> + __func__, phb->opal_id, (vf->bdfn >> 8),
> + ((vf->bdfn >> 3) & 0x1f), (vf->bdfn & 0x7));
> +}
> +
> +/*
> + * Update the SRIOV parameters that change when the number of
> + * VFs is configured.
> + */
> +static bool pci_iov_update_parameters(struct pci_iov *iov)
> +{
> + struct phb *phb = iov->phb;
> + uint16_t bdfn = iov->pd->bdfn;
> + uint32_t pos = iov->pos;
> + uint16_t val;
> + bool enabled;
> +
> + pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_CTRL, &val);
> + enabled = !!(val & PCIECAP_SRIOV_CTRL_VFE);
> + if (iov->enabled == enabled)
> + return false;
> +
> + if (enabled) {
> + pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_INITIAL_VF,
> + &iov->init_VFs);
> + pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_NUM_VF,
> + &iov->num_VFs);
> + pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_OFFSET,
> + &iov->offset);
> + pci_cfg_read16(phb, bdfn, pos + PCIECAP_SRIOV_VF_STRIDE,
> + &iov->stride);
> + } else {
> + iov->init_VFs = 0;
> + iov->num_VFs = 0;
> + iov->offset = 0;
> + iov->stride = 0;
> + }
> +
> + iov->enabled = enabled;
> + return true;
> +}
> +
> +static int64_t pci_iov_change(void *dev __unused,
> + struct pci_cfg_reg_filter *pcrf,
> + uint32_t offset __unused,
> + uint32_t len __unused,
> + uint32_t *data __unused,
> + bool write __unused)
> +{
> + struct pci_iov *iov = (struct pci_iov *)pcrf->data;
> + struct phb *phb = iov->phb;
> + struct pci_device *pd = iov->pd;
> + struct pci_device *vf, *tmp;
> + uint32_t i;
> + bool changed;
> +
> + /* Update SRIOV variable parameters */
> + changed = pci_iov_update_parameters(iov);
> + if (!changed)
> + return OPAL_SUCCESS;
> +
> + /* Remove all VFs that have been attached to the parent */
> + if (!iov->enabled) {
> + list_for_each_safe(&pd->children, vf, tmp, link)
> + list_del(&vf->link);
> + return OPAL_SUCCESS;
> + }
> +
> + /* Initialize the VFs and attach them to parent */
> + for (changed = false, i = 0; i < iov->num_VFs; i++) {
> + vf = &iov->VFs[i];
> + vf->bdfn = pd->bdfn + iov->offset + iov->stride * i;
> + list_add_tail(&pd->children, &vf->link);
> +
> + /*
> + * We don't populate the capabilities again if they have
> + * been existing, to save time. Also, we need delay for
> + * 100ms before the VF's config space becomes ready.
> + */
> + if (!pci_has_cap(vf, PCI_CFG_CAP_ID_EXP, false)) {
> + if (!changed) {
> + changed = !changed;
> + time_wait_ms(100);
> + }
> +
> + pci_init_capabilities(phb, vf);
> + pci_iov_vf_quirk(phb, vf);
> + }
> +
> + /* Call PHB hook */
> + if (phb->ops->device_init)
> + phb->ops->device_init(phb, pd, NULL);
> + }
> +
> + return OPAL_SUCCESS;
> +}
> +
> +/*
> + * This function is called with disabled SRIOV capability. So the VF's
> + * config address isn't finalized and its config space isn't accessible.
> + */
> +static void pci_iov_init_VF(struct pci_device *pd, struct pci_device *vf)
> +{
> + vf->is_bridge = false;
> + vf->is_multifunction = false;
> + vf->dev_type = PCIE_TYPE_ENDPOINT;
> + vf->scan_map = -1;
> + vf->vdid = pd->vdid;
> + vf->sub_vdid = pd->sub_vdid;
> + vf->class = pd->class;
> + vf->dn = NULL;
> + vf->slot = NULL;
> + vf->parent = pd;
> + list_head_init(&vf->pcrf);
> + list_head_init(&vf->children);
> +}
> +
> +void pci_init_iov_cap(struct phb *phb, struct pci_device *pd)
> +{
> + int64_t pos;
> + struct pci_iov *iov;
> + struct pci_cfg_reg_filter *pcrf;
> + uint32_t i;
> +
> + /* Search for SRIOV capability */
> + if (!pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
> + return;
> +
> + pos = pci_find_ecap(phb, pd->bdfn, PCIECAP_ID_SRIOV, NULL);
> + if (pos <= 0)
> + return;
> +
> + /* Allocate IOV */
> + iov = zalloc(sizeof(*iov));
> + if (!iov) {
> + prlog(PR_ERR, "%s: Cannot alloc IOV for
> %04x:%02x:%02x.%01x\n",
> + __func__, phb->opal_id, (pd->bdfn >> 8),
> + ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
> + return;
> + }
> +
> + /* Allocate VFs */
> + pci_cfg_read16(phb, pd->bdfn, pos + PCIECAP_SRIOV_TOTAL_VF,
> + &iov->total_VFs);
> + iov->VFs = zalloc(sizeof(*iov->VFs) * iov->total_VFs);
> + if (!iov->VFs) {
> + prlog(PR_ERR, "%s: Cannot alloc %d VFs for
> %04x:%02x:%02x.%01x\n",
> + __func__, iov->total_VFs, phb->opal_id,
> + (pd->bdfn >> 8), ((pd->bdfn >> 3) & 0x1f),
> + (pd->bdfn & 0x7));
> + free(iov);
> + return;
> + }
> +
> + /* Initialize VFs */
> + for (i = 0; i < iov->total_VFs; i++)
> + pci_iov_init_VF(pd, &iov->VFs[i]);
> +
> + /* Register filter for enabling or disabling SRIOV capability */
> + pcrf = pci_add_cfg_reg_filter(pd, pos + PCIECAP_SRIOV_CTRL, 2,
> + PCI_REG_FLAG_WRITE, pci_iov_change);
> + if (!pcrf) {
> + prlog(PR_ERR, "%s: Cannot set filter on
> %04x:%02x:%02x.%01x\n",
> + __func__, phb->opal_id, (pd->bdfn >> 8),
> + ((pd->bdfn >> 3) & 0x1f), (pd->bdfn & 0x7));
> + free(iov->VFs);
> + free(iov);
> + return;
> + }
> +
> + /* Associate filter and IOV capability */
> + pcrf->data = (void *)iov;
> +
> + /*
> + * Retrieve the number of VFs and other information if applicable.
> + * Register the SRIOV capability in the mean while.
> + */
> + iov->phb = phb;
> + iov->pd = pd;
> + iov->pos = pos;
> + iov->enabled = false;
> + pci_iov_update_parameters(iov);
> + pci_set_cap(pd, PCIECAP_ID_SRIOV, pos, iov, true);
> +}
> diff --git a/core/pci.c b/core/pci.c
> index b38d5a0..860ff5a 100644
> --- a/core/pci.c
> +++ b/core/pci.c
> @@ -18,6 +18,7 @@
> #include <cpu.h>
> #include <pci.h>
> #include <pci-cfg.h>
> +#include <pci-iov.h>
> #include <pci-slot.h>
> #include <timebase.h>
> #include <device.h>
> @@ -204,6 +205,7 @@ void pci_init_capabilities(struct phb *phb, struct
> pci_device *pd)
> {
> pci_init_pcie_cap(phb, pd);
> pci_init_aer_cap(phb, pd);
> + pci_init_iov_cap(phb, pd);
> }
>
> static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device
> *parent,
> diff --git a/include/pci-cfg.h b/include/pci-cfg.h
> index 27c0f74..530f0a8 100644
> --- a/include/pci-cfg.h
> +++ b/include/pci-cfg.h
> @@ -486,6 +486,38 @@
> #define PCIECAP_AER_TLP_PFX_LOG2 0x40
> #define PCIECAP_AER_TLP_PFX_LOG3 0x44
>
> +/* SRIOV capability */
> +#define PCIECAP_ID_SRIOV 0x10
> +#define PCIECAP_SRIOV_CAP 0x04
> +#define PCIECAP_SRIOV_CAP_VFM 0x01
> +#define PCIECAP_SRIOV_CAP_INTR(x) ((x) >> 21)
> +#define PCIECAP_SRIOV_CTRL 0x08
> +#define PCIECAP_SRIOV_CTRL_VFE 0x01
> +#define PCIECAP_SRIOV_CTRL_VFM 0x02
> +#define PCIECAP_SRIOV_CTRL_INTR 0x04
> +#define PCIECAP_SRIOV_CTRL_MSE 0x08
> +#define PCIECAP_SRIOV_CTRL_ARI 0x10
> +#define PCIECAP_SRIOV_STATUS 0x0a
> +#define PCIECAP_SRIOV_STATUS_VFM 0x01
> +#define PCIECAP_SRIOV_INITIAL_VF 0x0c
> +#define PCIECAP_SRIOV_TOTAL_VF 0x0e
> +#define PCIECAP_SRIOV_NUM_VF 0x10
> +#define PCIECAP_SRIOV_FUNC_LINK 0x12
> +#define PCIECAP_SRIOV_VF_OFFSET 0x14
> +#define PCIECAP_SRIOV_VF_STRIDE 0x16
> +#define PCIECAP_SRIOV_VF_DID 0x1a
> +#define PCIECAP_SRIOV_SUP_PGSIZE 0x1c
> +#define PCIECAP_SRIOV_SYS_PGSIZE 0x20
> +#define PCIECAP_SRIOV_BAR 0x24
> +#define PCIECAP_SRIOV_NUM_BARS 6
> +#define PCIECAP_SRIOV_VFM 0x3c
> +#define PCIECAP_SRIOV_VFM_BIR(x) ((x) & 7)
> +#define PCIECAP_SRIOV_VFM_OFFSET(x) ((x) & ~7)
> +#define PCIECAP_SRIOV_VFM_UA 0x0
> +#define PCIECAP_SRIOV_VFM_MI 0x1
> +#define PCIECAP_SRIOV_VFM_MO 0x2
> +#define PCIECAP_SRIOV_VFM_AV 0x3
> +
> /* Vendor specific extend capability */
> #define PCIECAP_ID_VNDR 0x0b
> #define PCIECAP_VNDR_HDR 0x04
> diff --git a/include/pci-iov.h b/include/pci-iov.h
> new file mode 100644
> index 0000000..787b2cd
> --- /dev/null
> +++ b/include/pci-iov.h
> @@ -0,0 +1,37 @@
> +/* Copyright 2013-2016 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef __PCI_IOV_H
> +#define __PCI_IOV_H
> +
> +struct pci_iov {
> + struct phb *phb;
> + struct pci_device *pd;
> + struct pci_device *VFs;
> + uint32_t pos;
> + bool enabled;
> + struct pci_cfg_reg_filter pcrf;
> +
> + uint16_t init_VFs;
> + uint16_t total_VFs;
> + uint16_t num_VFs;
> + uint16_t offset;
> + uint16_t stride;
> +};
> +
> +extern void pci_init_iov_cap(struct phb *phb, struct pci_device *pd);
> +
> +#endif
More information about the Skiboot
mailing list