[PATCH v5 3/4] drivers/vfio: EEH support for VFIO PCI device
Gavin Shan
gwshan at linux.vnet.ibm.com
Wed May 21 15:03:42 EST 2014
The patch adds a new ioctl command, VFIO_EEH_OP, to the VFIO PCI device
driver in order to support EEH functionality for PCI devices that have
been passed from host to guest via VFIO.
Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
---
Documentation/vfio.txt | 6 +-
arch/powerpc/include/asm/eeh.h | 10 ++
arch/powerpc/kernel/eeh.c | 323 +++++++++++++++++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci.c | 99 ++++++++++++-
include/uapi/linux/vfio.h | 43 ++++++
5 files changed, 474 insertions(+), 7 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..bb17ec7 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
an excellent performance which has limitations such as inability to do
locked pages accounting in real time.
-So 3 additional ioctls have been added:
+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
+which work on the basis of the additional ioctl command VFIO_EEH_OP.
+
+So 4 additional ioctls have been added:
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,6 +319,7 @@ So 3 additional ioctls have been added:
VFIO_IOMMU_DISABLE - disables the container.
+ VFIO_EEH_OP - EEH dependent operations
The code flow from the example above should be slightly changed:
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..93922ef 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -305,6 +305,16 @@ void eeh_add_device_late(struct pci_dev *);
void eeh_add_device_tree_late(struct pci_bus *);
void eeh_add_sysfs_files(struct pci_bus *);
void eeh_remove_device(struct pci_dev *);
+#ifdef CONFIG_VFIO_PCI_EEH
+int eeh_vfio_open(struct pci_dev *pdev);
+void eeh_vfio_release(struct pci_dev *pdev);
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+ int *retval, int *info);
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
+#endif
/**
* EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..2aaf90e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1098,6 +1098,329 @@ void eeh_remove_device(struct pci_dev *dev)
edev->mode &= ~EEH_DEV_SYSFS;
}
+#ifdef CONFIG_VFIO_PCI_EEH
+/**
+ * eeh_vfio_open - flag a PCI device and its PE as passed
+ * @pdev: PCI device being opened
+ *
+ * Sets the "passed" flag on the EEH device attached to @pdev and on
+ * the PE that device belongs to.
+ *
+ * Return: 0 on success, -ENODEV if @pdev is NULL or has no EEH device
+ * or PE behind it.
+ */
+int eeh_vfio_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+
+	/* A missing PCI device, EEH device or PE all end the same way */
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	eeh_dev_set_passed(edev, true);
+	eeh_pe_set_passed(edev->pe, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_open);
+
+/**
+ * eeh_vfio_release - clear the "passed" flag of a PCI device
+ * @pdev: PCI device being released
+ *
+ * Clears the passed flag of the EEH device attached to @pdev.  The
+ * flag of the owning PE is cleared as well once none of the PE's
+ * devices is flagged as passed any more.  Nothing happens if @pdev is
+ * NULL, or if the device or its PE is not currently flagged as passed.
+ */
+void eeh_vfio_release(struct pci_dev *pdev)
+{
+	bool release_pe = true;
+	struct eeh_pe *pe;
+	struct eeh_dev *tmp, *edev;
+
+	/* No PCI device ? */
+	if (!pdev)
+		return;
+
+	/* No EEH device, or one that was never passed ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !eeh_dev_passed(edev))
+		return;
+
+	/*
+	 * Fetch the PE before testing it: the check has to look at the
+	 * device's own PE, not at a pointer that is not assigned yet.
+	 */
+	pe = edev->pe;
+	if (!pe || !eeh_pe_passed(pe))
+		return;
+
+	/* Release device */
+	eeh_dev_set_passed(edev, false);
+
+	/* The PE stays flagged while any of its devices is still passed */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (eeh_dev_passed(edev)) {
+			release_pe = false;
+			break;
+		}
+	}
+
+	if (release_pe)
+		eeh_pe_set_passed(pe, false);
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_release);
+
+/*
+ * Resolve @pdev to its EEH device and PE, insisting that both are
+ * flagged as passed.  On success the results are stored through @pedev
+ * and @ppe (either may be NULL) and 0 is returned.  Otherwise -ENODEV
+ * is returned and the output pointers are left untouched.
+ */
+static int eeh_vfio_check_dev(struct pci_dev *pdev,
+			      struct eeh_dev **pedev,
+			      struct eeh_pe **ppe)
+{
+	struct eeh_dev *edev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+
+	/* The device itself must exist and be passed ... */
+	if (!edev || !eeh_dev_passed(edev))
+		return -ENODEV;
+
+	/* ... and so must the PE it sits in */
+	if (!edev->pe || !eeh_pe_passed(edev->pe))
+		return -ENODEV;
+
+	if (pedev)
+		*pedev = edev;
+	if (ppe)
+		*ppe = edev->pe;
+
+	return 0;
+}
+
+/**
+ * eeh_vfio_set_pe_option - apply an EEH option to the PE of a passed device
+ * @pdev: PCI device
+ * @option: value in the range EEH_OPT_DISABLE ... EEH_OPT_THAW_DMA
+ * @retval: status handed back to the caller: 0 on success, -7 if the
+ *          device cannot be found or the backend has no set_option
+ *          hook, -3 if @option is out of range, -1 if the backend fails
+ *
+ * EEH_OPT_DISABLE and EEH_OPT_ENABLE succeed without involving the
+ * backend; every other valid option is forwarded to
+ * eeh_ops->set_option().
+ *
+ * Return: 0 on success; otherwise -EINVAL, -ENOENT, or the non-zero
+ * value returned by eeh_vfio_check_dev() or the backend.
+ */
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int rc;
+
+	rc = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (rc) {
+		pr_debug("%s: Cannot find device %s\n",
+			 __func__, pdev ? pci_name(pdev) : "NULL");
+		*retval = -7;
+		return rc;
+	}
+
+	if (option < EEH_OPT_DISABLE || option > EEH_OPT_THAW_DMA) {
+		pr_debug("%s: Option %d out of range (%d, %d)\n",
+			 __func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		*retval = -3;
+		return -EINVAL;
+	}
+
+	/* Everything except enable/disable has to go to the backend */
+	if (option != EEH_OPT_DISABLE && option != EEH_OPT_ENABLE) {
+		if (!eeh_ops || !eeh_ops->set_option) {
+			*retval = -7;
+			return -ENOENT;
+		}
+
+		rc = eeh_ops->set_option(pe, option);
+		if (rc) {
+			pr_debug("%s: Failure %d from backend\n",
+				 __func__, rc);
+			*retval = -1;
+			return rc;
+		}
+	}
+
+	*retval = 0;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
+
+/**
+ * eeh_vfio_get_pe_addr - report the PE address or PE mode of a passed device
+ * @pdev: PCI device
+ * @option: 0 to query the PE address, 1 to query the PE mode
+ * @retval: status handed back to the caller: 0 on success, -3 on any
+ *          failure
+ * @info: on success receives the PE address (@option 0) or the value 1
+ *        (@option 1)
+ *
+ * Return: 0 on success; -EINVAL for an unknown @option, -ENODEV if the
+ * PE has no bus, or the error returned by eeh_vfio_check_dev().
+ */
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+			 int *retval, int *info)
+{
+	struct pci_bus *bus;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int rc;
+
+	rc = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (rc) {
+		*retval = -3;
+		return rc;
+	}
+
+	/*
+	 * PCI bus and device dependent PEs are not told apart here, so
+	 * every PE is reported as being in "shared" mode.  The PE
+	 * address has the format "00BBSS00".
+	 */
+	switch (option) {
+	case 0:
+		bus = eeh_pe_bus_get(pe);
+		if (!bus) {
+			*retval = -3;
+			return -ENODEV;
+		}
+
+		*retval = 0;
+		*info = bus->number << 16;
+		break;
+	case 1:
+		*retval = 0;
+		*info = 1;
+		break;
+	default:
+		pr_debug("%s: option %d out of range (0, 1)\n",
+			 __func__, option);
+		*retval = -3;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
+
+/**
+ * eeh_vfio_get_pe_state - translate the backend PE state for the caller
+ * @pdev: PCI device
+ * @retval: status handed back to the caller: 0 on success, -3 on failure
+ * @state: on success receives
+ *         1 if a reset is active, otherwise
+ *         0 if both DMA and MMIO are enabled,
+ *         2 if neither DMA nor MMIO is enabled,
+ *         4 if only DMA is enabled,
+ *         5 if only MMIO is enabled
+ *
+ * Return: 0 on success; -ENOENT if the backend has no get_state hook,
+ * or the error returned by eeh_vfio_check_dev().
+ */
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int flags, rc;
+
+	rc = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (rc) {
+		*retval = -3;
+		return rc;
+	}
+
+	if (!eeh_ops || !eeh_ops->get_state) {
+		pr_debug("%s: Unsupported request\n",
+			 __func__);
+		*retval = -3;
+		return -ENOENT;
+	}
+
+	flags = eeh_ops->get_state(pe, NULL);
+	if (flags & EEH_STATE_RESET_ACTIVE) {
+		*state = 1;
+	} else {
+		/* No reset in flight: the DMA/MMIO pair decides */
+		bool dma = !!(flags & EEH_STATE_DMA_ENABLED);
+		bool mmio = !!(flags & EEH_STATE_MMIO_ENABLED);
+
+		if (dma && mmio)
+			*state = 0;
+		else if (!dma && !mmio)
+			*state = 2;
+		else if (dma)
+			*state = 4;
+		else
+			*state = 5;
+	}
+
+	*retval = 0;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
+
+/**
+ * eeh_vfio_reset_pe - assert or deassert a reset on the PE of a passed device
+ * @pdev: PCI device
+ * @option: EEH_RESET_DEACTIVATE, EEH_RESET_HOT or EEH_RESET_FUNDAMENTAL
+ * @retval: status handed back to the caller: 0 on success, -3 if the
+ *          device cannot be found or @option is not supported, -7 if
+ *          the backend lacks the reset or set_option hook, -1 if the
+ *          backend fails
+ *
+ * After a successful EEH_RESET_DEACTIVATE the PE's MMIO and DMA are
+ * thawed and its EEH_PE_ISOLATED state is cleared.
+ *
+ * Return: 0 on success; otherwise -EINVAL, -ENOENT, or the non-zero
+ * value returned by eeh_vfio_check_dev() or the backend.
+ */
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int rc;
+
+	rc = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (rc) {
+		*retval = -3;
+		return rc;
+	}
+
+	if (!(option == EEH_RESET_DEACTIVATE ||
+	      option == EEH_RESET_HOT ||
+	      option == EEH_RESET_FUNDAMENTAL)) {
+		pr_debug("%s: Unsupported option %d\n",
+			 __func__, option);
+		*retval = -3;
+		return -EINVAL;
+	}
+
+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
+		pr_debug("%s: Unsupported request\n",
+			 __func__);
+		*retval = -7;
+		return -ENOENT;
+	}
+
+	rc = eeh_ops->reset(pe, option);
+	if (rc) {
+		pr_debug("%s: Failure %d from backend\n",
+			 __func__, rc);
+		*retval = -1;
+		return rc;
+	}
+
+	/* Asserting a reset needs nothing more */
+	if (option != EEH_RESET_DEACTIVATE) {
+		*retval = 0;
+		return 0;
+	}
+
+	/*
+	 * The PE is still frozen at this point and has to be thawed.
+	 * Doing so only after the reset has been deasserted keeps stray
+	 * IO away from the device during the reset, which could
+	 * otherwise freeze the PE recursively.
+	 */
+	rc = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+	if (rc) {
+		pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
+			 __func__, pe->phb->global_number, pe->addr, rc);
+		*retval = -1;
+		return rc;
+	}
+
+	rc = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+	if (rc) {
+		pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
+			 __func__, pe->phb->global_number, pe->addr, rc);
+		*retval = -1;
+		return rc;
+	}
+
+	eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+	*retval = 0;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
+
+/**
+ * eeh_vfio_configure_pe - restore the BARs of a passed device's PE
+ * @pdev: PCI device
+ * @retval: status handed back to the caller: 0 on success, -3 if the
+ *          device cannot be found
+ *
+ * Return: 0 on success, or the error returned by eeh_vfio_check_dev().
+ */
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int rc;
+
+	rc = eeh_vfio_check_dev(pdev, &edev, &pe);
+	if (rc) {
+		*retval = -3;
+		return rc;
+	}
+
+	/*
+	 * Access to the PCI config space of a VFIO device is limited:
+	 * parts of it, the BAR registers included, can be neither read
+	 * nor written by the guest.  The guest may therefore hold stale
+	 * values for those registers, so they are restored on the host
+	 * side.
+	 */
+	eeh_pe_restore_bars(pe);
+
+	*retval = 0;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
+
+#endif /* CONFIG_VFIO_PCI_EEH */
+
static int proc_eeh_show(struct seq_file *m, void *v)
{
if (!eeh_enabled()) {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..05c3dde 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -25,6 +25,9 @@
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
+#ifdef CONFIG_VFIO_PCI_EEH
+#include <asm/eeh.h>
+#endif
#include "vfio_pci_private.h"
@@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
pci_restore_state(pdev);
}
+/*
+ * Forward the release of @pdev to EEH.  The body is empty when
+ * CONFIG_VFIO_PCI_EEH is not set.
+ */
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+#ifdef CONFIG_VFIO_PCI_EEH
+	eeh_vfio_release(pdev);
+#endif
+}
+
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
- if (atomic_dec_and_test(&vdev->refcnt))
+ if (atomic_dec_and_test(&vdev->refcnt)) {
+ vfio_eeh_pci_release(vdev->pdev);
vfio_pci_disable(vdev);
+ }
module_put(THIS_MODULE);
}
+/*
+ * Forward the open of @pdev to EEH and hand back its result.  Always
+ * succeeds (returns 0) when CONFIG_VFIO_PCI_EEH is not set.
+ */
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+#ifdef CONFIG_VFIO_PCI_EEH
+	return eeh_vfio_open(pdev);
+#else
+	return 0;
+#endif
+}
+
static int vfio_pci_open(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
+ int ret;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
if (atomic_inc_return(&vdev->refcnt) == 1) {
- int ret = vfio_pci_enable(vdev);
- if (ret) {
- module_put(THIS_MODULE);
- return ret;
- }
+ ret = vfio_pci_enable(vdev);
+ if (ret)
+ goto error;
+
+ ret = vfio_eeh_pci_open(vdev->pdev);
+ if (ret)
+ goto error;
}
return 0;
+error:
+ module_put(THIS_MODULE);
+ return ret;
}
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -321,6 +349,51 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
return walk.ret;
}
+/*
+ * Dispatch one VFIO_EEH_OP request to the eeh_vfio_*() helper selected
+ * by info->op.  The per-operation results are written back into @info.
+ *
+ * Returns the helper's return value, -EINVAL for an unknown info->op,
+ * or -ENOENT when CONFIG_VFIO_PCI_EEH is not set.
+ */
+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)
+{
+	int ret = 0;
+
+#ifdef CONFIG_VFIO_PCI_EEH
+	switch (info->op) {
+	case VFIO_EEH_OP_SET_OPTION:
+		ret = eeh_vfio_set_pe_option(pdev,
+					     info->option.option,
+					     &info->option.ret);
+		break;
+	case VFIO_EEH_OP_GET_ADDR:
+		ret = eeh_vfio_get_pe_addr(pdev,
+					   info->addr.option,
+					   &info->addr.ret,
+					   &info->addr.info);
+		break;
+	case VFIO_EEH_OP_GET_STATE:
+		ret = eeh_vfio_get_pe_state(pdev,
+					    &info->state.ret,
+					    &info->state.reset_state);
+		/* The remaining state fields are fixed values */
+		info->state.cfg_cap = 1;
+		info->state.pe_unavail_info = 1000;
+		info->state.pe_recovery_info = 0;
+		break;
+	case VFIO_EEH_OP_PE_RESET:
+		ret = eeh_vfio_reset_pe(pdev,
+					info->reset.option,
+					&info->reset.ret);
+		break;
+	case VFIO_EEH_OP_PE_CONFIG:
+		ret = eeh_vfio_configure_pe(pdev,
+					    &info->config.ret);
+		/* Must not fall through: default would replace ret by -EINVAL */
+		break;
+	default:
+		ret = -EINVAL;
+		pr_debug("%s: Cannot handle op#%d\n",
+			 __func__, info->op);
+	}
+#else
+	ret = -ENOENT;
+#endif
+
+	return ret;
+}
+
static long vfio_pci_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
{
@@ -682,6 +755,20 @@ hot_reset_release:
kfree(groups);
return ret;
+ } else if (cmd == VFIO_EEH_OP) {
+ struct vfio_eeh_op info;
+ int ret = 0;
+
+ minsz = sizeof(info);
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ ret = -EFAULT;
+ return ret;
}
return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..518961d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+/*
+ * The VFIO EEH operation struct provides a way to support EEH functionality
+ * for a PCI device that is passed from host to guest via VFIO.
+ */
+#define VFIO_EEH_OP_SET_OPTION 0
+#define VFIO_EEH_OP_GET_ADDR 1
+#define VFIO_EEH_OP_GET_STATE 2
+#define VFIO_EEH_OP_PE_RESET 3
+#define VFIO_EEH_OP_PE_CONFIG 4
+
+struct vfio_eeh_op {
+ __u32 argsz; /* the VFIO_EEH_OP handler rejects values below sizeof(struct vfio_eeh_op) */
+ __u32 op; /* one of VFIO_EEH_OP_*; selects the union member in use */
+
+ union {
+ struct vfio_eeh_set_option { /* VFIO_EEH_OP_SET_OPTION */
+ __u32 option; /* in: option handed to eeh_vfio_set_pe_option() */
+ __s32 ret; /* out: status written by the helper */
+ } option;
+ struct vfio_eeh_pe_addr { /* VFIO_EEH_OP_GET_ADDR */
+ __u32 option; /* in: 0 or 1, see eeh_vfio_get_pe_addr() */
+ __s32 ret; /* out: status written by the helper */
+ __u32 info; /* out: result written by the helper */
+ } addr;
+ struct vfio_eeh_pe_state { /* VFIO_EEH_OP_GET_STATE */
+ __s32 ret; /* out: status written by the helper */
+ __u32 reset_state; /* out: state written by eeh_vfio_get_pe_state() */
+ __u32 cfg_cap; /* out: set to 1 by the handler */
+ __u32 pe_unavail_info; /* out: set to 1000 by the handler */
+ __u32 pe_recovery_info; /* out: set to 0 by the handler */
+ } state;
+ struct vfio_eeh_reset { /* VFIO_EEH_OP_PE_RESET */
+ __u32 option; /* in: option handed to eeh_vfio_reset_pe() */
+ __s32 ret; /* out: status written by the helper */
+ } reset;
+ struct vfio_eeh_config { /* VFIO_EEH_OP_PE_CONFIG */
+ __s32 ret; /* out: status written by the helper */
+ } config;
+ };
+};
+
+#define VFIO_EEH_OP _IO(VFIO_TYPE, VFIO_BASE + 21)
+
/* ***************************************************************** */
#endif /* _UAPIVFIO_H */
--
1.8.3.2
More information about the Linuxppc-dev
mailing list