[PATCH 3/4] drivers/vfio: New IOCTL command VFIO_EEH_OP
Gavin Shan
gwshan at linux.vnet.ibm.com
Tue May 20 18:30:11 EST 2014
This patch adds a new ioctl command, VFIO_EEH_OP, to the VFIO PCI
device in order to support EEH functionality for PCI devices that
have been passed through from host to guest via VFIO.
Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
---
arch/powerpc/platforms/powernv/Makefile | 1 +
arch/powerpc/platforms/powernv/eeh-vfio.c | 445 ++++++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci.c | 24 +-
drivers/vfio/pci/vfio_pci_private.h | 16 ++
include/uapi/linux/vfio.h | 43 +++
5 files changed, 523 insertions(+), 6 deletions(-)
create mode 100644 arch/powerpc/platforms/powernv/eeh-vfio.c
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 63cebb9..45cd833 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,5 +6,6 @@ obj-y += opal-msglog.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o
obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o
+obj-$(CONFIG_VFIO_PCI_EEH) += eeh-vfio.o
obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/eeh-vfio.c b/arch/powerpc/platforms/powernv/eeh-vfio.c
new file mode 100644
index 0000000..11adc55
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-vfio.c
@@ -0,0 +1,445 @@
+/*
+ * The file intends to support EEH functionality for those PCI devices,
+ * which have been passed through from host to guest via VFIO. So this
+ * file is naturally part of VFIO implementation on PowerNV platform.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/vfio.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/opal.h>
+#include <asm/msi_bitmap.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+#include <asm/uaccess.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/*
+ * Check that @pdev can be the target of an EEH request and look up the
+ * objects the request handlers operate on.
+ *
+ * @pdev:  PCI device the request refers to (may be NULL)
+ * @pedev: if non-NULL, receives the EEH device bound to @pdev
+ * @ppe:   if non-NULL, receives the PE that EEH device belongs to
+ * @pphb:  if non-NULL, receives the PowerNV PHB private data
+ *
+ * Returns 0 on success, -ENODEV when there is no device, no EEH device,
+ * no PE, or the device/PE is not flagged as passed, and -EACCES when
+ * the PHB does not have PNV_PHB_FLAG_EEH set. The output pointers are
+ * only written on success.
+ */
+static int powernv_eeh_vfio_check_dev(struct pci_dev *pdev,
+				      struct eeh_dev **pedev,
+				      struct eeh_pe **ppe,
+				      struct pnv_phb **pphb)
+{
+	struct eeh_dev *dev;
+	struct pnv_phb *host;
+
+	if (!pdev)
+		return -ENODEV;
+
+	/* Both the EEH device and its PE must be flagged as passed */
+	dev = pci_dev_to_eeh_dev(pdev);
+	if (!dev || !eeh_dev_passed(dev))
+		return -ENODEV;
+	if (!dev->pe || !eeh_pe_passed(dev->pe))
+		return -ENODEV;
+
+	/* The PHB itself has to support EEH */
+	host = dev->phb->private_data;
+	if ((host->flags & PNV_PHB_FLAG_EEH) == 0)
+		return -EACCES;
+
+	/* Hand back only what the caller asked for */
+	if (pedev)
+		*pedev = dev;
+	if (ppe)
+		*ppe = dev->pe;
+	if (pphb)
+		*pphb = host;
+
+	return 0;
+}
+
+/*
+ * VFIO_EEH_OP_SET_OPTION handler.
+ *
+ * @pdev: PCI device the request targets
+ * @info: user request; info->option.option is the requested option and
+ *        info->option.ret receives the status handed back to user space
+ *        (0 on success, -3 or -7 on failure)
+ *
+ * EEH_OPT_DISABLE and EEH_OPT_ENABLE are acknowledged without calling
+ * the backend; every other valid option is forwarded to the PHB's
+ * set_option() hook.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range option, -ENOENT if
+ * the PHB has no set_option() hook, or the error returned by the device
+ * lookup or the backend.
+ */
+static int powernv_eeh_vfio_set_option(struct pci_dev *pdev,
+				       struct vfio_eeh_op *info)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	int opcode = info->option.option;
+	int rc;
+
+	/* The device must be known to EEH and passed through */
+	rc = powernv_eeh_vfio_check_dev(pdev, &edev, &pe, &phb);
+	if (rc) {
+		pr_debug("%s: Cannot find device\n",
+			 __func__);
+		info->option.ret = -7;
+		return rc;
+	}
+
+	/* Reject anything outside [EEH_OPT_DISABLE, EEH_OPT_THAW_DMA] */
+	if (opcode < EEH_OPT_DISABLE ||
+	    opcode > EEH_OPT_THAW_DMA) {
+		pr_debug("%s: Opcode#%d out of range (%d, %d)\n",
+			 __func__, opcode, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		info->option.ret = -3;
+		return -EINVAL;
+	}
+
+	/* Only the remaining options need the backend */
+	if (opcode != EEH_OPT_DISABLE &&
+	    opcode != EEH_OPT_ENABLE) {
+		if (!phb->eeh_ops || !phb->eeh_ops->set_option) {
+			info->option.ret = -7;
+			return -ENOENT;
+		}
+
+		rc = phb->eeh_ops->set_option(pe, opcode);
+		if (rc) {
+			pr_debug("%s: Failure %d from backend\n",
+				 __func__, rc);
+			info->option.ret = -3;
+			return rc;
+		}
+	}
+
+	info->option.ret = 0;
+	return 0;
+}
+
+/*
+ * VFIO_EEH_OP_GET_ADDR handler.
+ *
+ * @pdev: PCI device the request targets
+ * @info: user request; info->addr.option selects the query (0 or 1),
+ *        info->addr.info receives the answer and info->addr.ret the
+ *        status handed back to user space
+ *
+ * Option 0 reports the number of the PE's bus shifted left by 16 bits;
+ * option 1 reports the constant 1. PCI bus and device sensitive PEs are
+ * not told apart here.
+ *
+ * NOTE(review): option 1 stores 1 in info->addr.ret while the other
+ * success path stores 0 - confirm this is intended.
+ *
+ * Returns 0 on success, -EINVAL for an unknown option, -ENODEV if the
+ * PE has no bus, or the error from the device lookup.
+ */
+static int powernv_eeh_vfio_get_addr(struct pci_dev *pdev,
+				     struct vfio_eeh_op *info)
+{
+	struct pci_bus *bus;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	int opcode = info->addr.option;
+	int rc;
+
+	/* The device must be known to EEH and passed through */
+	rc = powernv_eeh_vfio_check_dev(pdev, &edev, &pe, &phb);
+	if (rc) {
+		info->addr.ret = -3;
+		return rc;
+	}
+
+	switch (opcode) {
+	case 0:
+		bus = eeh_pe_bus_get(pe);
+		if (!bus) {
+			info->addr.ret = -3;
+			return -ENODEV;
+		}
+
+		info->addr.info = bus->number << 16;
+		info->addr.ret = 0;
+		return 0;
+	case 1:
+		info->addr.info = 1;
+		info->addr.ret = 1;
+		return 0;
+	default:
+		pr_debug("%s: opcode %d out of range (0, 1)\n",
+			 __func__, opcode);
+		info->addr.ret = -3;
+		return -EINVAL;
+	}
+}
+
+/*
+ * VFIO_EEH_OP_GET_STATE handler.
+ *
+ * @pdev: PCI device the request targets
+ * @info: user request; the info->state member is filled in
+ *
+ * The backend's EEH_STATE_* bitmask is translated into the numeric
+ * state reported in info->state.reset_state:
+ *
+ *   0 - not in reset, MMIO and DMA both enabled
+ *   1 - reset active
+ *   2 - not in reset, MMIO and DMA both stopped
+ *   4 - not in reset, MMIO enabled but DMA stopped
+ *   5 - anything else (PE unavailable)
+ *
+ * Returns 0 on success, -ENOENT if the PHB has no get_state() hook, or
+ * the error from the device lookup. info->state.ret is 0 on success
+ * and -3 on failure.
+ */
+static int powernv_eeh_vfio_get_state(struct pci_dev *pdev,
+				      struct vfio_eeh_op *info)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	bool rst_active, dma_en, mmio_en;
+	int result, ret = 0;
+
+	/* Device existing ? */
+	ret = powernv_eeh_vfio_check_dev(pdev, &edev, &pe, &phb);
+	if (ret) {
+		info->state.ret = -3;
+		goto out;
+	}
+
+	if (!phb->eeh_ops || !phb->eeh_ops->get_state) {
+		pr_debug("%s: Unsupported request\n",
+			 __func__);
+		ret = -ENOENT;
+		info->state.ret = -3;
+		goto out;
+	}
+
+	result = phb->eeh_ops->get_state(pe);
+
+	rst_active = !!(result & EEH_STATE_RESET_ACTIVE);
+	dma_en = !!(result & EEH_STATE_DMA_ENABLED);
+	mmio_en = !!(result & EEH_STATE_MMIO_ENABLED);
+
+	if (result < 0)
+		/* A negative value is an error, not a state bitmask */
+		info->state.reset_state = 5;
+	else if (rst_active)
+		info->state.reset_state = 1;
+	else if (dma_en && mmio_en)
+		info->state.reset_state = 0;
+	else if (!dma_en && !mmio_en)
+		info->state.reset_state = 2;
+	else if (!dma_en && mmio_en)
+		/* State 4 is "MMIO enabled, DMA stopped", not the reverse */
+		info->state.reset_state = 4;
+	else
+		info->state.reset_state = 5;
+
+	info->state.ret = 0;
+	info->state.cfg_cap = 1;
+	info->state.pe_unavail_info = 1000;
+	info->state.pe_recovery_info = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * VFIO_EEH_OP_PE_RESET handler: assert or deassert a reset on the PE
+ * that contains @pdev.
+ *
+ * @pdev: PCI device the request targets
+ * @info: user request; info->reset.option selects the reset type
+ *        (EEH_RESET_DEACTIVATE, EEH_RESET_HOT or EEH_RESET_FUNDAMENTAL)
+ *        and info->reset.ret receives the status handed back to user
+ *        space (0 on success, negative on failure)
+ *
+ * On EEH_RESET_DEACTIVATE the PE is additionally thawed (MMIO first,
+ * then DMA) and its EEH_PE_ISOLATED state is cleared.
+ *
+ * Returns 0 on success, -EINVAL for an unknown reset type, -ENOENT if
+ * the PHB lacks a set_option() or reset() hook, or the error returned
+ * by the device lookup or the backend.
+ */
+static int powernv_eeh_vfio_pe_reset(struct pci_dev *pdev,
+				     struct vfio_eeh_op *info)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	int opcode = info->reset.option;
+	int ret = 0;
+
+	/* Device existing ? */
+	ret = powernv_eeh_vfio_check_dev(pdev, &edev, &pe, &phb);
+	if (ret) {
+		/* Report through the reset member, like every other path here */
+		info->reset.ret = -3;
+		goto out;
+	}
+
+	/* Invalid opcode ? */
+	if (opcode != EEH_RESET_DEACTIVATE &&
+	    opcode != EEH_RESET_HOT &&
+	    opcode != EEH_RESET_FUNDAMENTAL) {
+		pr_debug("%s: Unsupported opcode %d\n",
+			 __func__, opcode);
+		ret = -EINVAL;
+		info->reset.ret = -3;
+		goto out;
+	}
+
+	/* Call into the IODA dependent backend to do the reset */
+	if (!phb->eeh_ops ||
+	    !phb->eeh_ops->set_option ||
+	    !phb->eeh_ops->reset) {
+		pr_debug("%s: Unsupported request\n",
+			 __func__);
+		ret = -ENOENT;
+		info->reset.ret = -7;
+		goto out;
+	}
+
+	/*
+	 * The frozen PE might be caused by the mechanism called
+	 * PAPR error injection, which is supposed to be one-shot
+	 * without "sticky" bit as being stated by the spec. But
+	 * the reality isn't that, at least on P7IOC. So we have
+	 * to clear that to avoid recursive error, which fails the
+	 * recovery eventually.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE)
+		opal_pci_reset(phb->opal_id,
+			       OPAL_PHB_ERROR,
+			       OPAL_ASSERT_RESET);
+
+	ret = phb->eeh_ops->reset(pe, opcode);
+	if (ret) {
+		pr_debug("%s: Failure %d from backend\n",
+			 __func__, ret);
+		info->reset.ret = -1;
+		goto out;
+	}
+
+	/*
+	 * The PE is still in frozen state and we need clear that.
+	 * It's good to clear frozen state after deassert to avoid
+	 * messy IO access during reset, which might cause recursive
+	 * frozen PE.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE) {
+		ret = phb->eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+		if (ret) {
+			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
+				 __func__, pe->phb->global_number, pe->addr, ret);
+			info->reset.ret = -1;
+			goto out;
+		}
+
+		ret = phb->eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+		if (ret) {
+			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
+				 __func__, pe->phb->global_number, pe->addr, ret);
+			info->reset.ret = -1;
+			goto out;
+		}
+
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+	}
+
+	info->reset.ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * VFIO_EEH_OP_PE_CONFIG handler: restore the BARs of every device in
+ * the PE that contains @pdev.
+ *
+ * @pdev: PCI device the request targets
+ * @info: user request; info->config.ret receives 0 on success or -3
+ *        when the device lookup fails
+ *
+ * Returns 0 on success or the error from the device lookup.
+ */
+static int powernv_eeh_vfio_pe_config(struct pci_dev *pdev,
+				      struct vfio_eeh_op *info)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	int rc;
+
+	rc = powernv_eeh_vfio_check_dev(pdev, &edev, &pe, &phb);
+	if (rc) {
+		info->config.ret = -3;
+		return rc;
+	}
+
+	/*
+	 * Config space access through VFIO is restricted: parts of it,
+	 * BAR registers included, can be neither read nor written by
+	 * the guest. The guest's view of those registers is therefore
+	 * stale, so the restore has to be done on the host side.
+	 */
+	eeh_pe_restore_bars(pe);
+	info->config.ret = 0;
+
+	return 0;
+}
+
+/*
+ * Flag @pdev's EEH device and the PE it belongs to as passed.
+ *
+ * Returns 0 on success, or -ENODEV when @pdev is NULL, has no EEH
+ * device, or that EEH device has no PE.
+ */
+int eeh_vfio_pci_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+
+	/* Nothing to flag without an EEH device that sits in a PE */
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	eeh_dev_set_passed(edev, true);
+	eeh_pe_set_passed(edev->pe, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_pci_open);
+
+/*
+ * Clear the "passed" flag on @pdev's EEH device and, once no device in
+ * its PE is flagged as passed any more, on the PE as well.
+ *
+ * Does nothing when @pdev is NULL, has no EEH device or PE, or when the
+ * device or its PE is not currently flagged as passed.
+ */
+void eeh_vfio_pci_release(struct pci_dev *pdev)
+{
+	bool release_pe = true;
+	struct eeh_pe *pe;
+	struct eeh_dev *tmp, *edev;
+
+	/* No PCI device ? */
+	if (!pdev)
+		return;
+
+	/*
+	 * No EEH device ? The PE has to be taken from the device here:
+	 * a local PE pointer is not valid until the checks have passed.
+	 */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(edev->pe))
+		return;
+
+	/* Release device */
+	pe = edev->pe;
+	eeh_dev_set_passed(edev, false);
+
+	/* Release PE only if none of its devices is still passed */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (eeh_dev_passed(edev)) {
+			release_pe = false;
+			break;
+		}
+	}
+
+	if (release_pe)
+		eeh_pe_set_passed(pe, false);
+}
+EXPORT_SYMBOL(eeh_vfio_pci_release);
+
+/*
+ * Entry point for the VFIO_EEH_OP ioctl.
+ *
+ * @pdev: PCI device the ioctl was issued on
+ * @arg:  user-space address of a struct vfio_eeh_op
+ *
+ * The request is copied in, dispatched on its op field to the matching
+ * handler and then copied back in full, so user space sees the handler's
+ * status even when the ioctl itself fails. An unknown op leaves the
+ * request untouched.
+ *
+ * Returns the handler's result, -EINVAL for an undersized argsz or an
+ * unknown op, or -EFAULT if either user copy fails.
+ */
+int eeh_vfio_pci_ioctl(struct pci_dev *pdev,
+		       unsigned long arg)
+{
+	void __user *uptr = (void __user *)arg;
+	struct vfio_eeh_op req;
+	unsigned long len = sizeof(req);
+	int rc;
+
+	/* Fetch the request from user space */
+	if (copy_from_user(&req, uptr, len)) {
+		pr_debug("%s: Cannot copy parameter 0x%lx\n",
+			 __func__, arg);
+		return -EFAULT;
+	}
+
+	/* The caller must have provided at least the whole structure */
+	if (req.argsz < len) {
+		pr_debug("%s: Invalid size (%d, %ld)\n",
+			 __func__, req.argsz, len);
+		return -EINVAL;
+	}
+
+	/* Dispatch on the requested operation */
+	if (req.op == VFIO_EEH_OP_SET_OPTION) {
+		rc = powernv_eeh_vfio_set_option(pdev, &req);
+	} else if (req.op == VFIO_EEH_OP_GET_ADDR) {
+		rc = powernv_eeh_vfio_get_addr(pdev, &req);
+	} else if (req.op == VFIO_EEH_OP_GET_STATE) {
+		rc = powernv_eeh_vfio_get_state(pdev, &req);
+	} else if (req.op == VFIO_EEH_OP_PE_RESET) {
+		rc = powernv_eeh_vfio_pe_reset(pdev, &req);
+	} else if (req.op == VFIO_EEH_OP_PE_CONFIG) {
+		rc = powernv_eeh_vfio_pe_config(pdev, &req);
+	} else {
+		pr_debug("%s: Cannot handle op#%d\n",
+			 __func__, req.op);
+		rc = -EINVAL;
+	}
+
+	/* Always hand the (possibly updated) request back */
+	if (copy_to_user(uptr, &req, len)) {
+		pr_debug("%s: Cannot copy parameter to user 0x%lx\n",
+			 __func__, arg);
+		return -EFAULT;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_pci_ioctl);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..ee82c7f 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -156,8 +156,11 @@ static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
- if (atomic_dec_and_test(&vdev->refcnt))
+
+ if (atomic_dec_and_test(&vdev->refcnt)) {
+ eeh_vfio_pci_release(vdev->pdev);
vfio_pci_disable(vdev);
+ }
module_put(THIS_MODULE);
}
@@ -165,19 +168,26 @@ static void vfio_pci_release(void *device_data)
static int vfio_pci_open(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
+ int ret;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
if (atomic_inc_return(&vdev->refcnt) == 1) {
- int ret = vfio_pci_enable(vdev);
- if (ret) {
- module_put(THIS_MODULE);
- return ret;
- }
+ ret = vfio_pci_enable(vdev);
+ if (ret)
+ goto error;
+
+ ret = eeh_vfio_pci_open(vdev->pdev);
+ if (ret) {
+ /*
+ * The device was enabled just above; undo that so a
+ * failed open does not leave it enabled.
+ */
+ vfio_pci_disable(vdev);
+ goto error;
+ }
}
return 0;
+
+error:
+ /* Drop the module reference taken at the top of this function */
+ module_put(THIS_MODULE);
+ return ret;
}
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -682,6 +692,8 @@ hot_reset_release:
kfree(groups);
return ret;
+ } else if (cmd == VFIO_EEH_OP) {
+ return eeh_vfio_pci_ioctl(vdev->pdev, arg);
}
return -ENOTTY;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 9c6d5d0..1273bb6 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -90,4 +90,20 @@ extern void vfio_pci_virqfd_exit(void);
extern int vfio_config_init(struct vfio_pci_device *vdev);
extern void vfio_config_free(struct vfio_pci_device *vdev);
+
+#ifdef CONFIG_VFIO_PCI_EEH
+extern int eeh_vfio_pci_open(struct pci_dev *pdev);
+extern void eeh_vfio_pci_release(struct pci_dev *pdev);
+extern int eeh_vfio_pci_ioctl(struct pci_dev *pdev, unsigned long arg);
+#else
+/*
+ * Stubs used when CONFIG_VFIO_PCI_EEH is not set: open succeeds,
+ * release does nothing and the ioctl fails with -ENOENT. They are all
+ * static inline so that including this header never emits unused
+ * function definitions.
+ */
+static inline int eeh_vfio_pci_open(struct pci_dev *pdev)
+{
+	return 0;
+}
+static inline void eeh_vfio_pci_release(struct pci_dev *pdev) { }
+static inline int eeh_vfio_pci_ioctl(struct pci_dev *pdev, unsigned long arg)
+{
+	return -ENOENT;
+}
+#endif /* CONFIG_VFIO_PCI_EEH */
#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..6e7f033 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+/*
+ * VFIO_EEH_OP - EEH support for a PCI device that has been passed from
+ * host to guest via VFIO.
+ *
+ * The caller fills in a struct vfio_eeh_op, sets "op" to one of the
+ * VFIO_EEH_OP_* values below and fills in the union member that belongs
+ * to that operation. The structure is copied back to the caller, with
+ * the "ret" field of that member carrying the operation's status.
+ */
+#define VFIO_EEH_OP_SET_OPTION 0
+#define VFIO_EEH_OP_GET_ADDR 1
+#define VFIO_EEH_OP_GET_STATE 2
+#define VFIO_EEH_OP_PE_RESET 3
+#define VFIO_EEH_OP_PE_CONFIG 4
+
+struct vfio_eeh_op {
+ /* Size of the caller's buffer; must be at least sizeof(struct vfio_eeh_op) */
+ __u32 argsz;
+ /* One of VFIO_EEH_OP_* */
+ __u32 op;
+
+ union {
+ /* VFIO_EEH_OP_SET_OPTION: "option" in, "ret" out */
+ struct vfio_eeh_set_option {
+ __u32 option;
+ __s32 ret;
+ } option;
+ /* VFIO_EEH_OP_GET_ADDR: "option" (0 or 1) in, "ret" and "info" out */
+ struct vfio_eeh_pe_addr {
+ __u32 option;
+ __s32 ret;
+ __u32 info;
+ } addr;
+ /* VFIO_EEH_OP_GET_STATE: all fields are outputs */
+ struct vfio_eeh_pe_state {
+ __s32 ret;
+ __u32 reset_state;
+ __u32 cfg_cap;
+ __u32 pe_unavail_info;
+ __u32 pe_recovery_info;
+ } state;
+ /* VFIO_EEH_OP_PE_RESET: "option" (reset type) in, "ret" out */
+ struct vfio_eeh_reset {
+ __u32 option;
+ __s32 ret;
+ } reset;
+ /* VFIO_EEH_OP_PE_CONFIG: "ret" out */
+ struct vfio_eeh_config {
+ __s32 ret;
+ } config;
+ };
+};
+
+/* ioctl number for the EEH operation described above */
+#define VFIO_EEH_OP _IO(VFIO_TYPE, VFIO_BASE + 21)
+
/* ***************************************************************** */
#endif /* _UAPIVFIO_H */
--
1.8.3.2
More information about the Linuxppc-dev
mailing list