[PATCH kernel 9/9] KVM: PPC: VFIO device: support SPAPR TCE

Wed Mar 9 20:20:12 AEDT 2016

On 03/09/2016 04:45 PM, David Gibson wrote:
> On Mon, Mar 07, 2016 at 02:41:17PM +1100, Alexey Kardashevskiy wrote:
>> sPAPR TCE IOMMU is para-virtualized and the guest does map/unmap
>> via hypercalls which take a logical bus id (LIOBN) as a target IOMMU
>> identifier. LIOBNs are made up, advertised to guest systems and
>> linked to IOMMU groups by the user space.
>> In order to enable acceleration for IOMMU operations in KVM, we need
>> to tell KVM the information about the LIOBN-to-group mapping.
>>
>> For that, a new KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN parameter
>> is added which accepts:
>> - a VFIO group fd and IO base address to find the actual hardware
>> TCE table;
>> - a LIOBN to assign to the found table.
>>
>> Before notifying KVM about new link, this check the group for being
>> registered with KVM device in order to release them at unexpected KVM
>> finish.
>>
>> This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user
>> space.
>>
>> While we are here, this also fixes VFIO KVM device compiling to let it
>> link to a KVM module.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> ---
>>   Documentation/virtual/kvm/devices/vfio.txt |  21 +++++-
>>   arch/powerpc/kvm/Kconfig                   |   1 +
>>   arch/powerpc/kvm/Makefile                  |   5 +-
>>   arch/powerpc/kvm/powerpc.c                 |   1 +
>>   include/uapi/linux/kvm.h                   |   9 +++
>>   virt/kvm/vfio.c                            | 106 +++++++++++++++++++++++++++++
>>   6 files changed, 140 insertions(+), 3 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
>> index ef51740..c0d3eb7 100644
>> --- a/Documentation/virtual/kvm/devices/vfio.txt
>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
>> @@ -16,7 +16,24 @@ Groups:
>>
>>   KVM_DEV_VFIO_GROUP attributes:
>>     KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
>> +	kvm_device_attr.addr points to an int32_t file descriptor
>> +	for the VFIO group.
>
> AFAICT these changes are accurate for VFIO as it is already, in which
> case it might be clearer to put them in a separate patch.
>
>>     KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
>> +	kvm_device_attr.addr points to an int32_t file descriptor
>> +	for the VFIO group.
>>
>> -For each, kvm_device_attr.addr points to an int32_t file descriptor
>> -for the VFIO group.
>> +  KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN: sets a liobn for a VFIO group
>> +	kvm_device_attr.addr points to a struct:
>> +		struct kvm_vfio_spapr_tce_liobn {
>> +			__u32	argsz;
>> +			__s32	fd;
>> +			__u32	liobn;
>> +			__u8	pad[4];
>> +			__u64	start_addr;
>> +		};
>> +		where
>> +		@argsz is the size of kvm_vfio_spapr_tce_liobn;
>> +		@fd is a file descriptor for a VFIO group;
>> +		@liobn is a logical bus id to be associated with the group;
>> +		@start_addr is a DMA window offset on the IO (PCI) bus
>
> For the cause of DDW and multiple windows, I'm assuming you can call
> this multiple times with different LIOBNs and the same IOMMU group?

Yes. It is called twice per each group (when DDW is activated) - for 32bit 
and 64bit windows, this is why @start_addr is there.

>> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
>> index 1059846..dfa3488 100644
>> --- a/arch/powerpc/kvm/Kconfig
>> +++ b/arch/powerpc/kvm/Kconfig
>> @@ -65,6 +65,7 @@ config KVM_BOOK3S_64
>>   	select KVM
>>   	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
>>   	select SPAPR_TCE_IOMMU if IOMMU_SUPPORT
>> +	select KVM_VFIO if VFIO
>>   	---help---
>>   	  Support running unmodified book3s_64 and book3s_32 guest kernels
>>   	  in virtual machines on book3s_64 host processors.
>> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
>> index 7f7b6d8..71f577c 100644
>> --- a/arch/powerpc/kvm/Makefile
>> +++ b/arch/powerpc/kvm/Makefile
>> @@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
>>   KVM := ../../../virt/kvm
>>
>>   common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>> -		$(KVM)/eventfd.o $(KVM)/vfio.o
>> +		$(KVM)/eventfd.o
>
> Please don't disable the VFIO device for the non-book3s case.  I added
> it (even though it didn't do anything until now) so that libvirt
> wouldn't choke when it finds it's not available.  Obviously the new
> ioctl needs to be only for the right IOMMU setup, but the device
> itself should be available always.

Ah. Ok, I'll fix this. I just wanted to be able to compile kvm as a module.

>>   CFLAGS_e500_mmu.o := -I.
>>   CFLAGS_e500_mmu_host.o := -I.
>> @@ -87,6 +87,9 @@ endif
>>   kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
>>   	book3s_xics.o
>>
>> +kvm-book3s_64-objs-$(CONFIG_KVM_VFIO) += \
>> +	$(KVM)/vfio.o \
>> +
>>   kvm-book3s_64-module-objs += \
>>   	$(KVM)/kvm_main.o \
>>   	$(KVM)/eventfd.o \
>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> index 19aa59b..63f188d 100644
>> --- a/arch/powerpc/kvm/powerpc.c
>> +++ b/arch/powerpc/kvm/powerpc.c
>> @@ -521,6 +521,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   	case KVM_CAP_SPAPR_TCE:
>>   	case KVM_CAP_SPAPR_TCE_64:
>> +	case KVM_CAP_SPAPR_TCE_VFIO:
>>   	case KVM_CAP_PPC_ALLOC_HTAB:
>>   	case KVM_CAP_PPC_RTAS:
>>   	case KVM_CAP_PPC_FIXUP_HCALL:
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 080ffbf..f1abbea 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1056,6 +1056,7 @@ struct kvm_device_attr {
>>   #define  KVM_DEV_VFIO_GROUP			1
>>   #define   KVM_DEV_VFIO_GROUP_ADD			1
>>   #define   KVM_DEV_VFIO_GROUP_DEL			2
>> +#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN	3
>>
>>   enum kvm_device_type {
>>   	KVM_DEV_TYPE_FSL_MPIC_20	= 1,
>> @@ -1075,6 +1076,14 @@ enum kvm_device_type {
>>   	KVM_DEV_TYPE_MAX,
>>   };
>>
>> +struct kvm_vfio_spapr_tce_liobn {
>> +	__u32	argsz;
>> +	__s32	fd;
>> +	__u32	liobn;
>> +	__u8	pad[4];
>> +	__u64	start_addr;
>> +};
>> +
>>   /*
>>    * ioctls for VM fds
>>    */
>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
>> index 1dd087d..87c771e 100644
>> --- a/virt/kvm/vfio.c
>> +++ b/virt/kvm/vfio.c
>> @@ -20,6 +20,10 @@
>>   #include <linux/vfio.h>
>>   #include "vfio.h"
>>
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +#include <asm/kvm_ppc.h>
>> +#endif
>> +
>>   struct kvm_vfio_group {
>>   	struct list_head node;
>>   	struct vfio_group *vfio_group;
>> @@ -60,6 +64,22 @@ static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
>>   	symbol_put(vfio_group_put_external_user);
>>   }
>>
>> +static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group)
>> +{
>> +	int (*fn)(struct vfio_group *);
>> +	int ret = -1;
>
> Should this be -ESOMETHING?
>
>> +	fn = symbol_get(vfio_external_user_iommu_id);
>> +	if (!fn)
>> +		return ret;
>> +
>> +	ret = fn(vfio_group);
>> +
>> +	symbol_put(vfio_external_user_iommu_id);
>> +
>> +	return ret;
>> +}
>> +
>>   static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
>>   {
>>   	long (*fn)(struct vfio_group *, unsigned long);
>> @@ -110,6 +130,22 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
>>   	mutex_unlock(&kv->lock);
>>   }
>>
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +static void kvm_vfio_spapr_detach_iommu_group(struct kvm *kvm,
>> +		struct vfio_group *vfio_group)
>
>
> Shouldn't this go in the same patch that introduced the attach
> function?

Having less patches which touch different maintainers areas is better. I 
cannot avoid touching both PPC KVM and VFIO in this patch but I can in 
"[PATCH kernel 6/9] KVM: PPC: Associate IOMMU group with guest view of TCE 
table".

>
>> +{
>> +	int group_id;
>> +	struct iommu_group *grp;
>> +
>> +	group_id = kvm_vfio_external_user_iommu_id(vfio_group);
>> +	grp = iommu_group_get_by_id(group_id);
>> +	if (grp) {
>> +		kvm_spapr_tce_detach_iommu_group(kvm, grp);
>> +		iommu_group_put(grp);
>> +	}
>> +}
>> +#endif
>> +
>>   static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>>   {
>>   	struct kvm_vfio *kv = dev->private;
>> @@ -186,6 +222,10 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>>   				continue;
>>
>>   			list_del(&kvg->node);
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>
> Better to make a no-op version of the call than have to #ifdef at the
> callsite.

It is questionable. A x86 reader may decide that 
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN is implemented for x86 and get confused.

>
>> +			kvm_vfio_spapr_detach_iommu_group(dev->kvm,
>> +					kvg->vfio_group);
>> +#endif
>>   			kvm_vfio_group_put_external_user(kvg->vfio_group);
>>   			kfree(kvg);
>>   			ret = 0;
>> @@ -201,6 +241,69 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>>   		kvm_vfio_update_coherency(dev);
>>
>>   		return ret;
>> +
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN: {
>> +		struct kvm_vfio_spapr_tce_liobn param;
>> +		unsigned long minsz;
>> +		struct kvm_vfio *kv = dev->private;
>> +		struct vfio_group *vfio_group;
>> +		struct kvm_vfio_group *kvg;
>> +		struct fd f;
>> +
>> +		minsz = offsetofend(struct kvm_vfio_spapr_tce_liobn,
>> +				start_addr);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		f = fdget(param.fd);
>> +		if (!f.file)
>> +			return -EBADF;
>> +
>> +		vfio_group = kvm_vfio_group_get_external_user(f.file);
>> +		fdput(f);
>> +
>> +		if (IS_ERR(vfio_group))
>> +			return PTR_ERR(vfio_group);
>> +
>> +		ret = -ENOENT;
>
> Shouldn't there be some runtime test for the type of the IOMMU?  It's
> possible a kernel could be built for a platform supporting multiple
> IOMMU types.

Well, may make sense but I do not know to test that. The IOMMU type is a 
VFIO container property, not a group property and here (KVM) we only have 
groups.

And calling  iommu_group_get_iommudata() is quite useless as it returns a 
void pointer... I could probably check that the release() callback is the 
one I set via iommu_group_set_iommudata() but there is no API to get it 
from a group.

And I cannot really imagine a kernel with CONFIG_PPC_BOOK3S_64 (and 
therefore KVM_CAP_SPAPR_TCE_VFIO enabled) with different IOMMU types. Can 
the same kernel binary image work on both BOOK3S and embedded PPC? Where 
these other types can come from?

>
>> +		mutex_lock(&kv->lock);
>> +
>> +		list_for_each_entry(kvg, &kv->group_list, node) {
>> +			int group_id;
>> +			struct iommu_group *grp;
>> +
>> +			if (kvg->vfio_group != vfio_group)
>> +				continue;
>> +
>> +			group_id = kvm_vfio_external_user_iommu_id(
>> +					kvg->vfio_group);
>> +			grp = iommu_group_get_by_id(group_id);
>> +			if (!grp) {
>> +				ret = -EFAULT;
>> +				break;
>> +			}
>> +
>> +			ret = kvm_spapr_tce_attach_iommu_group(dev->kvm,
>> +					param.liobn, param.start_addr,
>> +					grp);
>> +			if (ret)
>> +				iommu_group_put(grp);
>> +			break;
>> +		}
>> +
>> +		mutex_unlock(&kv->lock);
>> +
>> +		kvm_vfio_group_put_external_user(vfio_group);
>> +
>> +		return ret;
>> +	}
>> +#endif /* CONFIG_SPAPR_TCE_IOMMU */
>>   	}
>>
>>   	return -ENXIO;
>> @@ -225,6 +328,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
>>   		switch (attr->attr) {
>>   		case KVM_DEV_VFIO_GROUP_ADD:
>>   		case KVM_DEV_VFIO_GROUP_DEL:
>> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>> +		case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN:
>> +#endif
>>   			return 0;
>>   		}
>>
>

-- 
Alexey