[PATCH 3/4] drivers/vfio: New IOCTL command VFIO_EEH_OP

Gavin Shan gwshan at linux.vnet.ibm.com
Tue May 20 18:30:11 EST 2014


The patch adds a new IOCTL command, VFIO_EEH_OP, to the VFIO PCI device
to support EEH functionality for PCI devices that have been passed
through from host to guest via VFIO.

Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/Makefile   |   1 +
 arch/powerpc/platforms/powernv/eeh-vfio.c | 445 ++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c               |  24 +-
 drivers/vfio/pci/vfio_pci_private.h       |  16 ++
 include/uapi/linux/vfio.h                 |  43 +++
 5 files changed, 523 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/eeh-vfio.c

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 63cebb9..45cd833 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,5 +6,6 @@ obj-y			+= opal-msglog.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
 obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
+obj-$(CONFIG_VFIO_PCI_EEH)	+= eeh-vfio.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/eeh-vfio.c b/arch/powerpc/platforms/powernv/eeh-vfio.c
new file mode 100644
index 0000000..11adc55
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-vfio.c
@@ -0,0 +1,445 @@
+/*
+  * This file supports EEH functionality for PCI devices that have been
+  * passed through from host to guest via VFIO, so it is naturally part
+  * of the VFIO implementation on the PowerNV platform.
+  *
+  * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/vfio.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/opal.h>
+#include <asm/msi_bitmap.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+#include <asm/uaccess.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/*
+ * Check that @pdev is passed through with EEH enabled on its PHB and
+ * return its EEH device, PE and PHB through the optional out pointers.
+ */
+static int powernv_eeh_vfio_check_dev(struct pci_dev *pdev,
+				      struct eeh_dev **pedev,
+				      struct eeh_pe **ppe,
+				      struct pnv_phb **pphb)
+{
+	struct eeh_dev *dev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+	struct pnv_phb *hose_priv;
+
+	/* The device and its PE must both be marked as passed through */
+	if (!dev || !eeh_dev_passed(dev) ||
+	    !dev->pe || !eeh_pe_passed(dev->pe))
+		return -ENODEV;
+
+	/* EEH must be flagged as enabled on the PHB */
+	hose_priv = dev->phb->private_data;
+	if (!(hose_priv->flags & PNV_PHB_FLAG_EEH))
+		return -EACCES;
+
+	if (pedev)
+		*pedev = dev;
+	if (ppe)
+		*ppe = dev->pe;
+	if (pphb)
+		*pphb = hose_priv;
+
+	return 0;
+}
+
+/*
+ * Handle VFIO_EEH_OP_SET_OPTION.  Enable and disable requests succeed
+ * right away; any other valid option is forwarded to the PHB backend.
+ * The status is reported through info->option.ret.
+ */
+static int powernv_eeh_vfio_set_option(struct pci_dev *pdev,
+				       struct vfio_eeh_op *info)
+{
+	int opcode = info->option.option;
+	struct pnv_phb *phb;
+	struct eeh_pe *pe;
+	int rc;
+
+	/* The device must pass the passed-through/EEH checks first */
+	rc = powernv_eeh_vfio_check_dev(pdev, NULL, &pe, &phb);
+	if (rc) {
+		pr_debug("%s: Cannot find device\n", __func__);
+		info->option.ret = -7;
+		return rc;
+	}
+
+	/* Reject options outside the supported range */
+	if (opcode < EEH_OPT_DISABLE || opcode > EEH_OPT_THAW_DMA) {
+		pr_debug("%s: Opcode#%d out of range (%d, %d)\n",
+			 __func__, opcode, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		info->option.ret = -3;
+		return -EINVAL;
+	}
+
+	/* Enable and disable are acknowledged without calling the backend */
+	if (opcode == EEH_OPT_DISABLE || opcode == EEH_OPT_ENABLE) {
+		info->option.ret = 0;
+		return 0;
+	}
+
+	/* Everything else needs the backend's set_option() hook */
+	if (!phb->eeh_ops || !phb->eeh_ops->set_option) {
+		info->option.ret = -7;
+		return -ENOENT;
+	}
+
+	rc = phb->eeh_ops->set_option(pe, opcode);
+	if (rc) {
+		pr_debug("%s: Failure %d from backend\n", __func__, rc);
+		info->option.ret = -3;
+		return rc;
+	}
+
+	info->option.ret = 0;
+	return 0;
+}
+
+/*
+ * Handle VFIO_EEH_OP_GET_ADDR.  Option 0 stores the PE bus number,
+ * shifted left by 16, in info->addr.info; option 1 stores a constant 1.
+ * The status is reported through info->addr.ret.
+ */
+static int powernv_eeh_vfio_get_addr(struct pci_dev *pdev,
+				     struct vfio_eeh_op *info)
+{
+	int opcode = info->addr.option;
+	struct pci_bus *pe_bus;
+	struct eeh_pe *pe;
+	int rc;
+
+	/* The device must pass the passed-through/EEH checks first */
+	rc = powernv_eeh_vfio_check_dev(pdev, NULL, &pe, NULL);
+	if (rc) {
+		info->addr.ret = -3;
+		return rc;
+	}
+
+	/* PCI bus and device sensitive PEs are not told apart here */
+	switch (opcode) {
+	case 0:
+		pe_bus = eeh_pe_bus_get(pe);
+		if (!pe_bus) {
+			info->addr.ret = -3;
+			return -ENODEV;
+		}
+
+		info->addr.ret = 0;
+		info->addr.info = pe_bus->number << 16;
+		break;
+	case 1:
+		/* NOTE(review): status is 1 rather than 0 here - confirm */
+		info->addr.info = 1;
+		info->addr.ret = 1;
+		break;
+	default:
+		/* Only options 0 and 1 are accepted */
+		pr_debug("%s: opcode %d out of range (0, 1)\n",
+			__func__, opcode);
+		info->addr.ret = -3;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle VFIO_EEH_OP_GET_STATE: query the PE state from the PHB backend
+ * and translate it into the reset_state code reported in info->state.
+ */
+static int powernv_eeh_vfio_get_state(struct pci_dev *pdev,
+				      struct vfio_eeh_op *info)
+{
+	bool in_reset, dma_on, mmio_on;
+	struct pnv_phb *phb;
+	struct eeh_pe *pe;
+	int state, rc;
+
+	/* The device must pass the passed-through/EEH checks first */
+	rc = powernv_eeh_vfio_check_dev(pdev, NULL, &pe, &phb);
+	if (rc) {
+		info->state.ret = -3;
+		return rc;
+	}
+
+	/* The backend must provide a get_state() hook */
+	if (!phb->eeh_ops || !phb->eeh_ops->get_state) {
+		pr_debug("%s: Unsupported request\n", __func__);
+		info->state.ret = -3;
+		return -ENOENT;
+	}
+
+	state = phb->eeh_ops->get_state(pe);
+	in_reset = !!(state & EEH_STATE_RESET_ACTIVE);
+	dma_on = !!(state & EEH_STATE_DMA_ENABLED);
+	mmio_on = !!(state & EEH_STATE_MMIO_ENABLED);
+
+	/* An active reset takes precedence over the DMA and MMIO flags */
+	if (in_reset)
+		info->state.reset_state = 1;
+	else if (dma_on && mmio_on)
+		info->state.reset_state = 0;
+	else if (!dma_on && !mmio_on)
+		info->state.reset_state = 2;
+	else if (dma_on)
+		info->state.reset_state = 4;
+	else
+		info->state.reset_state = 5;
+
+	info->state.ret = 0;
+	info->state.cfg_cap = 1;
+	info->state.pe_unavail_info = 1000;
+	info->state.pe_recovery_info = 0;
+
+	return 0;
+}
+
+/*
+ * Handle VFIO_EEH_OP_PE_RESET: run the requested reset operation on the
+ * PE through the PHB backend.  After a successful deactivate the PE is
+ * thawed (MMIO, then DMA) and its isolated state is cleared.  The status
+ * is reported through info->reset.ret.
+ */
+static int powernv_eeh_vfio_pe_reset(struct pci_dev *pdev,
+				     struct vfio_eeh_op *info)
+{
+	struct eeh_pe *pe;
+	struct pnv_phb *phb;
+	int opcode = info->reset.option;
+	int ret;
+
+	/* The device must pass the passed-through/EEH checks first */
+	ret = powernv_eeh_vfio_check_dev(pdev, NULL, &pe, &phb);
+	if (ret) {
+		info->reset.ret = -3;
+		goto out;
+	}
+
+	/* Invalid opcode ? */
+	if (opcode != EEH_RESET_DEACTIVATE && opcode != EEH_RESET_HOT &&
+	    opcode != EEH_RESET_FUNDAMENTAL) {
+		pr_debug("%s: Unsupported opcode %d\n", __func__, opcode);
+		ret = -EINVAL;
+		info->reset.ret = -3;
+		goto out;
+	}
+
+	/* Both the reset() and set_option() backend hooks are used below */
+	if (!phb->eeh_ops || !phb->eeh_ops->set_option ||
+	    !phb->eeh_ops->reset) {
+		pr_debug("%s: Unsupported request\n", __func__);
+		ret = -ENOENT;
+		info->reset.ret = -7;
+		goto out;
+	}
+
+	/*
+	 * The frozen PE might be caused by the mechanism called
+	 * PAPR error injection, which is supposed to be one-shot
+	 * without "sticky" bit as being stated by the spec. But
+	 * the reality isn't that, at least on P7IOC. So we have
+	 * to clear that to avoid recursive error, which fails the
+	 * recovery eventually.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE)
+		opal_pci_reset(phb->opal_id,
+			       OPAL_PHB_ERROR,
+			       OPAL_ASSERT_RESET);
+
+	ret = phb->eeh_ops->reset(pe, opcode);
+	if (ret) {
+		pr_debug("%s: Failure %d from backend\n", __func__, ret);
+		info->reset.ret = -1;
+		goto out;
+	}
+
+	/*
+	 * The PE is still in frozen state and we need clear that.
+	 * It's good to clear frozen state after deassert to avoid
+	 * messy IO access during reset, which might cause recursive
+	 * frozen PE.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE) {
+		ret = phb->eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+		if (ret) {
+			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
+				__func__, pe->phb->global_number, pe->addr, ret);
+			info->reset.ret = -1;
+			goto out;
+		}
+
+		ret = phb->eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+		if (ret) {
+			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
+				__func__, pe->phb->global_number, pe->addr, ret);
+			info->reset.ret = -1;
+			goto out;
+		}
+
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+	}
+
+	info->reset.ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Handle VFIO_EEH_OP_PE_CONFIG: restore the PE's BARs with
+ * eeh_pe_restore_bars().  The status is reported through info->config.ret.
+ */
+static int powernv_eeh_vfio_pe_config(struct pci_dev *pdev,
+				      struct vfio_eeh_op *info)
+{
+	struct eeh_pe *pe;
+	int rc;
+
+	/* The device must pass the passed-through/EEH checks first */
+	rc = powernv_eeh_vfio_check_dev(pdev, NULL, &pe, NULL);
+	if (rc) {
+		info->config.ret = -3;
+		return rc;
+	}
+
+	/*
+	 * Config space access through VFIO is limited: parts of it,
+	 * BAR registers included, cannot be read or written by the
+	 * guest, which is therefore left with stale values.  Restore
+	 * those registers on the host side instead.
+	 */
+	eeh_pe_restore_bars(pe);
+	info->config.ret = 0;
+
+	return 0;
+}
+
+/*
+ * Mark the EEH device behind @pdev, and the PE it belongs to, as passed
+ * through.  Returns 0 on success, or -ENODEV when @pdev is NULL or has
+ * no EEH device or PE.
+ */
+int eeh_vfio_pci_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *dev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+
+	/* Both an EEH device and the PE it belongs to are required */
+	if (!dev || !dev->pe)
+		return -ENODEV;
+
+	eeh_dev_set_passed(dev, true);
+	eeh_pe_set_passed(dev->pe, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_pci_open);
+
+/*
+ * Undo eeh_vfio_pci_open(): clear the passed-through mark on @pdev's EEH
+ * device, and on its PE once no device in the PE is passed through.
+ */
+void eeh_vfio_pci_release(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+	struct eeh_dev *tmp;
+	struct eeh_pe *pe;
+	bool release_pe = true;
+
+	/* Nothing to do unless the device and its PE were passed through */
+	if (!edev || !eeh_dev_passed(edev) ||
+	    !edev->pe || !eeh_pe_passed(edev->pe))
+		return;
+
+	/* Release device */
+	pe = edev->pe;
+	eeh_dev_set_passed(edev, false);
+
+	/* Keep the PE marked while any of its devices is still passed */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (eeh_dev_passed(edev)) {
+			release_pe = false;
+			break;
+		}
+	}
+
+	if (release_pe)
+		eeh_pe_set_passed(pe, false);
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_pci_release);
+
+/*
+ * VFIO_EEH_OP ioctl: run the EEH operation requested through @arg on @pdev.
+ */
+int eeh_vfio_pci_ioctl(struct pci_dev *pdev, unsigned long arg)
+{
+	struct vfio_eeh_op info;
+	unsigned long minsz = sizeof(info);
+	int ret = -EINVAL;
+
+	/* Copy over user argument */
+	if (copy_from_user(&info, (void __user *)arg, minsz)) {
+		pr_debug("%s: Cannot copy parameter 0x%lx\n", __func__, arg);
+		return -EFAULT;
+	}
+
+	/* The caller must declare at least the whole structure */
+	if (info.argsz < minsz) {
+		pr_debug("%s: Invalid size (%u, %lu)\n",
+			__func__, info.argsz, minsz);
+		return -EINVAL;
+	}
+
+	/* Route according to operation */
+	switch (info.op) {
+	case VFIO_EEH_OP_SET_OPTION:
+		ret = powernv_eeh_vfio_set_option(pdev, &info);
+		break;
+	case VFIO_EEH_OP_GET_ADDR:
+		ret = powernv_eeh_vfio_get_addr(pdev, &info);
+		break;
+	case VFIO_EEH_OP_GET_STATE:
+		ret = powernv_eeh_vfio_get_state(pdev, &info);
+		break;
+	case VFIO_EEH_OP_PE_RESET:
+		ret = powernv_eeh_vfio_pe_reset(pdev, &info);
+		break;
+	case VFIO_EEH_OP_PE_CONFIG:
+		ret = powernv_eeh_vfio_pe_config(pdev, &info);
+		break;
+	default:	/* unknown operation: ret stays -EINVAL */
+		pr_debug("%s: Cannot handle op#%u\n", __func__, info.op);
+	}
+
+	/* Copy the structure back even on failure, so the status is seen */
+	if (copy_to_user((void __user *)arg, &info, minsz)) {
+		pr_debug("%s: Cannot copy parameter to user 0x%lx\n",
+			__func__, arg);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_pci_ioctl);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..ee82c7f 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -156,8 +156,11 @@ static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
 
-	if (atomic_dec_and_test(&vdev->refcnt))
+
+	if (atomic_dec_and_test(&vdev->refcnt)) {
+		eeh_vfio_pci_release(vdev->pdev);
 		vfio_pci_disable(vdev);
+	}
 
 	module_put(THIS_MODULE);
 }
@@ -165,19 +168,26 @@ static void vfio_pci_release(void *device_data)
 static int vfio_pci_open(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
+	int ret;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
 	if (atomic_inc_return(&vdev->refcnt) == 1) {
-		int ret = vfio_pci_enable(vdev);
-		if (ret) {
-			module_put(THIS_MODULE);
-			return ret;
-		}
+		ret = vfio_pci_enable(vdev);
+		if (ret)
+			goto error;
+		ret = eeh_vfio_pci_open(vdev->pdev);
+		if (ret)
+			goto disable;
 	}
 
 	return 0;
+disable:
+	vfio_pci_disable(vdev);		/* undo vfio_pci_enable() */
+error:
+	module_put(THIS_MODULE);
+	return ret;
 }
 
 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -682,6 +692,8 @@ hot_reset_release:
 
 		kfree(groups);
 		return ret;
+	} else if (cmd == VFIO_EEH_OP) {
+		return eeh_vfio_pci_ioctl(vdev->pdev, arg);
 	}
 
 	return -ENOTTY;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 9c6d5d0..1273bb6 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -90,4 +90,20 @@ extern void vfio_pci_virqfd_exit(void);
 
 extern int vfio_config_init(struct vfio_pci_device *vdev);
 extern void vfio_config_free(struct vfio_pci_device *vdev);
+
+#ifdef CONFIG_VFIO_PCI_EEH
+extern int eeh_vfio_pci_open(struct pci_dev *pdev);
+extern void eeh_vfio_pci_release(struct pci_dev *pdev);
+extern int eeh_vfio_pci_ioctl(struct pci_dev *pdev, unsigned long arg);
+#else /* !CONFIG_VFIO_PCI_EEH: stubs so callers need no #ifdef */
+static inline int eeh_vfio_pci_open(struct pci_dev *pdev)
+{
+	return 0;
+}
+static inline void eeh_vfio_pci_release(struct pci_dev *pdev) { }
+static inline int eeh_vfio_pci_ioctl(struct pci_dev *pdev, unsigned long arg)
+{
+	return -ENOENT;
+}
+#endif /* CONFIG_VFIO_PCI_EEH */
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..6e7f033 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
 
+/*
+ * EEH operations on a PCI device passed from host to guest via VFIO.
+ * Set op to one of VFIO_EEH_OP_* and use the matching union member.
+ */
+#define VFIO_EEH_OP_SET_OPTION	0
+#define VFIO_EEH_OP_GET_ADDR	1
+#define VFIO_EEH_OP_GET_STATE	2
+#define VFIO_EEH_OP_PE_RESET	3
+#define VFIO_EEH_OP_PE_CONFIG	4
+
+struct vfio_eeh_op {
+	__u32 argsz;	/* at least sizeof(struct vfio_eeh_op) */
+	__u32 op;	/* one of VFIO_EEH_OP_* */
+
+	union {
+		struct vfio_eeh_set_option {	/* VFIO_EEH_OP_SET_OPTION */
+			__u32 option;	/* in: option to set */
+			__s32 ret;	/* out: status */
+		} option;
+		struct vfio_eeh_pe_addr {	/* VFIO_EEH_OP_GET_ADDR */
+			__u32 option;	/* in: 0 or 1 */
+			__s32 ret;	/* out: status */
+			__u32 info;	/* out: requested value */
+		} addr;
+		struct vfio_eeh_pe_state {	/* VFIO_EEH_OP_GET_STATE */
+			__s32 ret;	/* out: status */
+			__u32 reset_state;	/* out */
+			__u32 cfg_cap;	/* out */
+			__u32 pe_unavail_info;	/* out */
+			__u32 pe_recovery_info;	/* out */
+		} state;
+		struct vfio_eeh_reset {		/* VFIO_EEH_OP_PE_RESET */
+			__u32 option;	/* in: reset operation */
+			__s32 ret;	/* out: status */
+		} reset;
+		struct vfio_eeh_config {	/* VFIO_EEH_OP_PE_CONFIG */
+			__s32 ret;	/* out: status */
+		} config;
+	};
+};
+
+#define VFIO_EEH_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
1.8.3.2



More information about the Linuxppc-dev mailing list