[PATCH v5 3/4] drivers/vfio: EEH support for VFIO PCI device

Alexander Graf agraf at suse.de
Wed May 21 23:07:26 EST 2014


On 21.05.14 07:03, Gavin Shan wrote:
> The patch adds new IOCTL command VFIO_EEH_OP to VFIO PCI device
> to support EEH functionality for PCI devices, which have been
> passed from host to guest via VFIO.
>
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---
>   Documentation/vfio.txt         |   6 +-
>   arch/powerpc/include/asm/eeh.h |  10 ++
>   arch/powerpc/kernel/eeh.c      | 323 +++++++++++++++++++++++++++++++++++++++++
>   drivers/vfio/pci/vfio_pci.c    |  99 ++++++++++++-
>   include/uapi/linux/vfio.h      |  43 ++++++
>   5 files changed, 474 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..bb17ec7 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
>   an excellent performance which has limitations such as inability to do
>   locked pages accounting in real time.
>   
> -So 3 additional ioctls have been added:
> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
> +which works on the basis of additional ioctl command VFIO_EEH_OP.
> +
> +So 4 additional ioctls have been added:
>   
>   	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   		of the DMA window on the PCI bus.
> @@ -316,6 +319,7 @@ So 3 additional ioctls have been added:
>   
>   	VFIO_IOMMU_DISABLE - disables the container.
>   
> +	VFIO_EEH_OP - EEH dependent operations

Please document exactly what the ioctl does. In an ideal world, a VFIO 
user will just look at the documentation and be able to write a program 
against the API with it.

>   
>   The code flow from the example above should be slightly changed:
>   
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 34a2d83..93922ef 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -305,6 +305,16 @@ void eeh_add_device_late(struct pci_dev *);
>   void eeh_add_device_tree_late(struct pci_bus *);
>   void eeh_add_sysfs_files(struct pci_bus *);
>   void eeh_remove_device(struct pci_dev *);
> +#ifdef CONFIG_VFIO_PCI_EEH
> +int eeh_vfio_open(struct pci_dev *pdev);
> +void eeh_vfio_release(struct pci_dev *pdev);
> +int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
> +int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
> +			 int *retval, int *info);
> +int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
> +int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
> +int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
> +#endif
>   
>   /**
>    * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 9c6b899..2aaf90e 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1098,6 +1098,329 @@ void eeh_remove_device(struct pci_dev *dev)
>   	edev->mode &= ~EEH_DEV_SYSFS;
>   }
>   
> +#ifdef CONFIG_VFIO_PCI_EEH
> +int eeh_vfio_open(struct pci_dev *pdev)

Why vfio? Also that config option will not be set if vfio is compiled as 
a module.

> +{
> +	struct eeh_dev *edev;
> +
> +	/* No PCI device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !edev->pe)
> +		return -ENODEV;
> +
> +	eeh_dev_set_passed(edev, true);
> +	eeh_pe_set_passed(edev->pe, true);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_open);
> +
> +void eeh_vfio_release(struct pci_dev *pdev)
> +{
> +	bool release_pe = true;
> +	struct eeh_pe *pe = NULL;
> +	struct eeh_dev *tmp, *edev;
> +
> +	/* No PCI device ? */
> +	if (!pdev)
> +		return;
> +
> +	/* No EEH device ? */
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(pe))
> +		return;
> +
> +	/* Release device */
> +	pe = edev->pe;
> +	eeh_dev_set_passed(edev, false);
> +
> +	/* Release PE */
> +	eeh_pe_for_each_dev(pe, edev, tmp) {
> +		if (eeh_dev_passed(edev)) {
> +			release_pe = false;
> +			break;
> +		}
> +	}
> +
> +	if (release_pe)
> +		eeh_pe_set_passed(pe, false);
> +}
> +EXPORT_SYMBOL(eeh_vfio_release);
> +
> +static int eeh_vfio_check_dev(struct pci_dev *pdev,
> +			      struct eeh_dev **pedev,
> +			      struct eeh_pe **ppe)
> +{
> +	struct eeh_dev *edev;
> +
> +	/* No device ? */
> +	if (!pdev)
> +		return -ENODEV;
> +
> +	edev = pci_dev_to_eeh_dev(pdev);
> +	if (!edev || !eeh_dev_passed(edev) ||
> +	    !edev->pe || !eeh_pe_passed(edev->pe))
> +		return -ENODEV;
> +
> +	if (pedev)
> +		*pedev = edev;
> +	if (ppe)
> +		*ppe = edev->pe;
> +
> +	return 0;
> +}
> +
> +int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		pr_debug("%s: Cannot find device %s\n",
> +			__func__, pdev ? pci_name(pdev) : "NULL");
> +		*retval = -7;

What are these? Please use proper kernel internal return values for 
errors. I don't want to see anything even remotely tied to RTAS in any 
of these patches.

> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option < EEH_OPT_DISABLE ||
> +	    option > EEH_OPT_THAW_DMA) {

This is quite confusing to read because it's not obvious what is in 
between these. Just make this a switch() statement that lists the 
allowed options. Gcc will be smart enough to optimize that into a bounds 
check.

> +		pr_debug("%s: Option %d out of range (%d, %d)\n",
> +			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
> +		*retval = -3;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (option == EEH_OPT_DISABLE ||
> +	    option == EEH_OPT_ENABLE) {
> +		*retval = 0;
> +	} else {
> +		if (!eeh_ops || !eeh_ops->set_option) {
> +			*retval = -7;
> +			ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, option);
> +		if (ret) {
> +			pr_debug("%s: Failure %d from backend\n",
> +				__func__, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		*retval = 0;
> +	}
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
> +
> +int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
> +			 int *retval, int *info)
> +{
> +	struct pci_bus *bus;
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option != 0 && option != 1) {

0? 1? What? Don't these have names? And again, please use a switch() for 
this function.

> +		pr_debug("%s: option %d out of range (0, 1)\n",
> +			__func__, option);
> +		*retval = -3;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Fill result according to option. We don't differentiate
> +	 * PCI bus and device dependent PE here. So all PEs are
> +	 * built in "shared" mode. Also, the PE address has the format
> +	 * of "00BBSS00".
> +	 */
> +	if (option == 0) {
> +		bus = eeh_pe_bus_get(pe);
> +		if (!bus) {
> +			*retval = -3;
> +			ret = -ENODEV;
> +			goto out;
> +		}
> +
> +		*retval = 0;
> +		*info = bus->number << 16;

How about positive numbers for the number and negative ones for errors?

> +	} else {
> +		*retval = 0;
> +		*info = 1;
> +	}
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
> +
> +int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int result, ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	if (!eeh_ops || !eeh_ops->get_state) {
> +		pr_debug("%s: Unsupported request\n",
> +			__func__);
> +		ret = -ENOENT;
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	result = eeh_ops->get_state(pe, NULL);
> +	if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +	     (result & EEH_STATE_DMA_ENABLED) &&
> +	     (result & EEH_STATE_MMIO_ENABLED))
> +		*state = 0;
> +	else if (result & EEH_STATE_RESET_ACTIVE)
> +		*state = 1;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 !(result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		*state = 2;
> +	else if (!(result & EEH_STATE_RESET_ACTIVE) &&
> +		 (result & EEH_STATE_DMA_ENABLED) &&
> +		 !(result & EEH_STATE_MMIO_ENABLED))
> +		*state = 4;
> +	else
> +		*state = 5;

What are these numbers?

> +
> +	*retval = 0;
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
> +
> +int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/* Invalid option ? */
> +	if (option != EEH_RESET_DEACTIVATE &&
> +	    option != EEH_RESET_HOT &&
> +	    option != EEH_RESET_FUNDAMENTAL) {
> +		pr_debug("%s: Unsupported option %d\n",
> +			__func__, option);
> +		ret = -EINVAL;
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
> +		pr_debug("%s: Unsupported request\n",
> +			__func__);
> +		ret = -ENOENT;
> +		*retval = -7;
> +		goto out;
> +	}
> +
> +	ret = eeh_ops->reset(pe, option);
> +	if (ret) {
> +		pr_debug("%s: Failure %d from backend\n",
> +			 __func__, ret);
> +		*retval = -1;
> +		goto out;
> +	}
> +
> +	/*
> +	 * The PE is still in frozen state and we need clear that.
> +	 * It's good to clear frozen state after deassert to avoid
> +	 * messy IO access during reset, which might cause recrusive

recursive

> +	 * frozen PE.
> +	 */
> +	if (option == EEH_RESET_DEACTIVATE) {
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
> +		if (ret) {
> +			pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
> +				__func__, pe->phb->global_number, pe->addr, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
> +		if (ret) {
> +			pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
> +				__func__, pe->phb->global_number, pe->addr, ret);
> +			*retval = -1;
> +			goto out;
> +		}
> +
> +		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
> +	}
> +
> +	*retval = 0;
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
> +
> +int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
> +{
> +	struct eeh_dev *edev;
> +	struct eeh_pe *pe;
> +	int ret = 0;
> +
> +	/* Device existing ? */
> +	ret = eeh_vfio_check_dev(pdev, &edev, &pe);
> +	if (ret) {
> +		*retval = -3;
> +		goto out;
> +	}
> +
> +	/*
> +	 * The access to PCI config space on VFIO device has some
> +	 * limitations. Part of PCI config space, including BAR
> +	 * registers are not readable and writable. So the guest
> +	 * should have stale values for those registers and we have
> +	 * to restore them in host side.

I don't understand this comment. When is "configure_pe" called in the 
first place? Please provide proper function descriptions for each of 
these exported functions that tell someone who may want to use them what 
they do.

Also, don't mention VFIO or guests in any function inside this file.

> +	 */
> +	eeh_pe_restore_bars(pe);
> +	*retval = 0;
> +
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
> +
> +#endif /* CONFIG_VFIO_PCI_EEH */
> +
>   static int proc_eeh_show(struct seq_file *m, void *v)
>   {
>   	if (!eeh_enabled()) {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 7ba0424..05c3dde 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -25,6 +25,9 @@
>   #include <linux/types.h>
>   #include <linux/uaccess.h>
>   #include <linux/vfio.h>
> +#ifdef CONFIG_VFIO_PCI_EEH
> +#include <asm/eeh.h>
> +#endif
>   
>   #include "vfio_pci_private.h"
>   
> @@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>   	pci_restore_state(pdev);
>   }
>   
> +static void vfio_eeh_pci_release(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	eeh_vfio_release(pdev);
> +#endif
> +}
> +
>   static void vfio_pci_release(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
>   
> -	if (atomic_dec_and_test(&vdev->refcnt))
> +	if (atomic_dec_and_test(&vdev->refcnt)) {
> +		vfio_eeh_pci_release(vdev->pdev);
>   		vfio_pci_disable(vdev);
> +	}
>   
>   	module_put(THIS_MODULE);
>   }
>   
> +static int vfio_eeh_pci_open(struct pci_dev *pdev)
> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	ret = eeh_vfio_open(pdev);
> +#endif
> +	return ret;
> +}
> +
>   static int vfio_pci_open(void *device_data)
>   {
>   	struct vfio_pci_device *vdev = device_data;
> +	int ret;
>   
>   	if (!try_module_get(THIS_MODULE))
>   		return -ENODEV;
>   
>   	if (atomic_inc_return(&vdev->refcnt) == 1) {
> -		int ret = vfio_pci_enable(vdev);
> -		if (ret) {
> -			module_put(THIS_MODULE);
> -			return ret;
> -		}
> +		ret = vfio_pci_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_eeh_pci_open(vdev->pdev);
> +		if (ret)
> +			goto error;
>   	}
>   
>   	return 0;
> +error:
> +	module_put(THIS_MODULE);
> +	return ret;
>   }
>   
>   static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> @@ -321,6 +349,51 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>   	return walk.ret;
>   }
>   
> +static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)

I still don't like the idea of that multiplexing ioctl. I don't see any 
benefit in it whatsoever. Just create 5 individual ioctls with their own 
simple interfaces.

Also, this interface has nothing to do with RTAS. So don't sneak in RTAS 
error numbers anywhere ;). It's QEMU's task to convert from kernel error 
codes to RTAS error codes.


Alex

> +{
> +	int ret = 0;
> +
> +#ifdef CONFIG_VFIO_PCI_EEH
> +	switch (info->op) {
> +	case VFIO_EEH_OP_SET_OPTION:
> +		ret = eeh_vfio_set_pe_option(pdev,
> +					     info->option.option,
> +					     &info->option.ret);
> +		break;
> +	case VFIO_EEH_OP_GET_ADDR:
> +		ret = eeh_vfio_get_pe_addr(pdev,
> +					   info->addr.option,
> +					   &info->addr.ret,
> +					   &info->addr.info);
> +		break;
> +	case VFIO_EEH_OP_GET_STATE:
> +		ret = eeh_vfio_get_pe_state(pdev,
> +					    &info->state.ret,
> +					    &info->state.reset_state);
> +		info->state.cfg_cap = 1;
> +		info->state.pe_unavail_info = 1000;
> +		info->state.pe_recovery_info = 0;
> +		break;
> +	case VFIO_EEH_OP_PE_RESET:
> +		ret = eeh_vfio_reset_pe(pdev,
> +					info->reset.option,
> +					&info->reset.ret);
> +		break;
> +	case VFIO_EEH_OP_PE_CONFIG:
> +		ret = eeh_vfio_configure_pe(pdev,
> +					    &info->config.ret);
> +	default:
> +		ret = -EINVAL;
> +		pr_debug("%s: Cannot handle op#%d\n",
> +			__func__, info->op);
> +	}
> +#else
> +	ret = -ENOENT;
> +#endif
> +
> +	return ret;
> +}
> +
>   static long vfio_pci_ioctl(void *device_data,
>   			   unsigned int cmd, unsigned long arg)
>   {
> @@ -682,6 +755,20 @@ hot_reset_release:
>   
>   		kfree(groups);
>   		return ret;
> +	} else if (cmd == VFIO_EEH_OP) {
> +		struct vfio_eeh_op info;
> +		int ret = 0;
> +
> +		minsz = sizeof(info);
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
> +		if (copy_to_user((void __user *)arg, &info, minsz))
> +			ret = -EFAULT;
> +		return ret;
>   	}
>   
>   	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index cb9023d..518961d 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
>   
>   #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
>   
> +/*
> + * The VFIO operation struct provides way to support EEH functionality
> + * for PCI device that is passed from host to guest via VFIO.
> + */
> +#define VFIO_EEH_OP_SET_OPTION	0
> +#define VFIO_EEH_OP_GET_ADDR	1
> +#define VFIO_EEH_OP_GET_STATE	2
> +#define VFIO_EEH_OP_PE_RESET	3
> +#define VFIO_EEH_OP_PE_CONFIG	4
> +
> +struct vfio_eeh_op {
> +	__u32 argsz;
> +	__u32 op;
> +
> +	union {
> +		struct vfio_eeh_set_option {
> +			__u32 option;
> +			__s32 ret;
> +		} option;
> +		struct vfio_eeh_pe_addr {
> +			__u32 option;
> +			__s32 ret;
> +			__u32 info;
> +		} addr;
> +		struct vfio_eeh_pe_state {
> +			__s32 ret;
> +			__u32 reset_state;
> +			__u32 cfg_cap;
> +			__u32 pe_unavail_info;
> +			__u32 pe_recovery_info;
> +		} state;
> +		struct vfio_eeh_reset {
> +			__u32 option;
> +			__s32 ret;
> +		} reset;
> +		struct vfio_eeh_config {
> +			__s32 ret;
> +		} config;
> +	};
> +};
> +
> +#define VFIO_EEH_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
> +
>   /* ***************************************************************** */
>   
>   #endif /* _UAPIVFIO_H */



More information about the Linuxppc-dev mailing list