[PATCH kernel v4 4/4] powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown
Alexey Kardashevskiy
aik at ozlabs.ru
Tue Oct 25 15:55:56 AEDT 2016
On 25/10/16 15:44, David Gibson wrote:
> On Mon, Oct 24, 2016 at 05:53:10PM +1100, Alexey Kardashevskiy wrote:
>> At the moment the userspace tool is expected to request pinning of
>> the entire guest RAM when the VFIO IOMMU SPAPR v2 driver is present.
>> When the userspace process finishes, all the pinned pages need to
>> be put; this is done as part of the userspace memory context (MM)
>> destruction which happens on the very last mmdrop().
>>
>> This approach has a problem: an MM of a userspace process may live
>> longer than the userspace process itself, because kernel threads
>> borrow the MM of whichever userspace process was running on the CPU
>> where the kernel thread got scheduled. If this happens, the MM remains
>> referenced until that exact kernel thread wakes up again and
>> releases the very last reference to the MM; on an idle system this
>> can take hours.
>>
>> This moves the tracking of preregistered regions from the MM to VFIO;
>> instead of using mm_iommu_table_group_mem_t::used, a
>> tce_container::prereg_list is added so that each container releases
>> the regions which it has preregistered.
>>
>> This changes the userspace interface to return -EBUSY if a memory
>> region is already registered in a container. However, this should not
>> have any practical effect as the only userspace tool available now
>> registers a memory region only once per container anyway.
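
Just to spell out the userspace-visible part, a rough sketch (not the
actual tool): it assumes "container" is a /dev/vfio/vfio fd which has
already been switched to VFIO_SPAPR_TCE_v2_IOMMU and had a group attached,
and "buf"/"size" are page aligned:

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int prereg(int container, void *buf, uint64_t size)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (uint64_t)(uintptr_t)buf,
		.size = size,
	};

	/* Pins the pages and records the region in this container */
	if (ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg))
		return -errno;

	return 0;
}

so with this patch:

	prereg(container, buf, size);	/* == 0 */
	prereg(container, buf, size);	/* == -EBUSY, same container */
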
>>
>> As tce_iommu_register_pages/tce_iommu_unregister_pages are called
>> under container->lock, this does not need additional locking.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> Reviewed-by: Nicholas Piggin <npiggin at gmail.com>
>
> On the grounds that this leaves things in a better state than before:
>
> Reviewed-by: David Gibson <david at gibson.dropbear.id.au>
>
> On the other hand the implementation is kind of clunky, with the way
> it keeps the mm-level and vfio-level lists of regions in parallel.
> With this change, does the mm-level list actually serve any purpose at
> all, or could it all be moved into the vfio-level list?
The mm-level list lets us avoid calling gup() for every container (a minor
thing, I suppose), and it also tracks the number of active mappings, which
will become useful when we add in-kernel real-mode TCE acceleration, as the
vfio-level code cannot run in real mode.
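
If it helps, a toy model (plain C, not the kernel code) of why both levels
exist: the mm-level entry is what actually holds the pinned pages plus a
use count shared by all containers of that mm, while the per-container
entry added by this patch is only a note of what this particular container
took, so it can drop it at release time:

#include <stdlib.h>

/* Toy model only: the mm-level entry owns the pinned pages. */
struct mm_region {
	unsigned long vaddr, entries;
	unsigned int used;		/* shared by all containers of the mm */
	struct mm_region *next;
};

static struct mm_region *mm_list;	/* stands in for the mm-level list */

/* The expensive pinning happens only when no mm-level entry exists yet. */
static struct mm_region *mm_get(unsigned long vaddr, unsigned long entries)
{
	struct mm_region *r;

	for (r = mm_list; r; r = r->next)
		if (r->vaddr == vaddr && r->entries == entries) {
			++r->used;	/* another container: no new pinning */
			return r;
		}

	r = calloc(1, sizeof(*r));
	r->vaddr = vaddr;
	r->entries = entries;
	r->used = 1;			/* first container: pages pinned here */
	r->next = mm_list;
	mm_list = r;
	return r;
}

/* The per-container list would then just keep pointers to mm_region so
 * that container release can issue the matching "put" for each one. */
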
>
>> ---
>> Changes:
>> v4:
>> * changed tce_iommu_register_pages() to call mm_iommu_find() first and
>> avoid calling mm_iommu_put() if memory is preregistered already
>>
>> v3:
>> * moved tce_iommu_prereg_free() call out of list_for_each_entry()
>>
>> v2:
>> * updated commit log
>> ---
>> arch/powerpc/mm/mmu_context_book3s64.c | 4 ---
>> arch/powerpc/mm/mmu_context_iommu.c | 11 -------
>> drivers/vfio/vfio_iommu_spapr_tce.c | 58 +++++++++++++++++++++++++++++++++-
>> 3 files changed, 57 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
>> index ad82735..1a07969 100644
>> --- a/arch/powerpc/mm/mmu_context_book3s64.c
>> +++ b/arch/powerpc/mm/mmu_context_book3s64.c
>> @@ -159,10 +159,6 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
>>
>> void destroy_context(struct mm_struct *mm)
>> {
>> -#ifdef CONFIG_SPAPR_TCE_IOMMU
>> - mm_iommu_cleanup(mm);
>> -#endif
>> -
>> #ifdef CONFIG_PPC_ICSWX
>> drop_cop(mm->context.acop, mm);
>> kfree(mm->context.cop_lockp);
>> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
>> index 4c6db09..104bad0 100644
>> --- a/arch/powerpc/mm/mmu_context_iommu.c
>> +++ b/arch/powerpc/mm/mmu_context_iommu.c
>> @@ -365,14 +365,3 @@ void mm_iommu_init(struct mm_struct *mm)
>> {
>> INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
>> }
>> -
>> -void mm_iommu_cleanup(struct mm_struct *mm)
>> -{
>> - struct mm_iommu_table_group_mem_t *mem, *tmp;
>> -
>> - list_for_each_entry_safe(mem, tmp, &mm->context.iommu_group_mem_list,
>> - next) {
>> - list_del_rcu(&mem->next);
>> - mm_iommu_do_free(mem);
>> - }
>> -}
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index 81ab93f..001a488 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -86,6 +86,15 @@ struct tce_iommu_group {
>> };
>>
>> /*
>> + * A container needs to remember which preregistered region it has
>> + * referenced to do proper cleanup at the userspace process exit.
>> + */
>> +struct tce_iommu_prereg {
>> + struct list_head next;
>> + struct mm_iommu_table_group_mem_t *mem;
>> +};
>> +
>> +/*
>> * The container descriptor supports only a single group per container.
>> * Required by the API as the container is not supplied with the IOMMU group
>> * at the moment of initialization.
>> @@ -98,12 +107,27 @@ struct tce_container {
>> struct mm_struct *mm;
>> struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>> struct list_head group_list;
>> + struct list_head prereg_list;
>> };
>>
>> +static long tce_iommu_prereg_free(struct tce_container *container,
>> + struct tce_iommu_prereg *tcemem)
>> +{
>> + long ret;
>> +
>> + list_del(&tcemem->next);
>> + ret = mm_iommu_put(container->mm, tcemem->mem);
>> + kfree(tcemem);
>> +
>> + return ret;
>> +}
>> +
>> static long tce_iommu_unregister_pages(struct tce_container *container,
>> __u64 vaddr, __u64 size)
>> {
>> struct mm_iommu_table_group_mem_t *mem;
>> + struct tce_iommu_prereg *tcemem;
>> + bool found = false;
>>
>> if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
>> return -EINVAL;
>> @@ -112,7 +136,17 @@ static long tce_iommu_unregister_pages(struct tce_container *container,
>> if (!mem)
>> return -ENOENT;
>>
>> - return mm_iommu_put(container->mm, mem);
>> + list_for_each_entry(tcemem, &container->prereg_list, next) {
>> + if (tcemem->mem == mem) {
>> + found = true;
>> + break;
>> + }
>> + }
>> +
>> + if (!found)
>> + return -ENOENT;
>> +
>> + return tce_iommu_prereg_free(container, tcemem);
>> }
>>
>> static long tce_iommu_register_pages(struct tce_container *container,
>> @@ -120,16 +154,29 @@ static long tce_iommu_register_pages(struct tce_container *container,
>> {
>> long ret = 0;
>> struct mm_iommu_table_group_mem_t *mem = NULL;
>> + struct tce_iommu_prereg *tcemem;
>> unsigned long entries = size >> PAGE_SHIFT;
>>
>> if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
>> ((vaddr + size) < vaddr))
>> return -EINVAL;
>>
>> + mem = mm_iommu_find(container->mm, vaddr, entries);
>> + if (mem) {
>> + list_for_each_entry(tcemem, &container->prereg_list, next) {
>> + if (tcemem->mem == mem)
>> + return -EBUSY;
>> + }
>> + }
>> +
>> ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
>> if (ret)
>> return ret;
>>
>> + tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
>> + tcemem->mem = mem;
>> + list_add(&tcemem->next, &container->prereg_list);
>> +
>> container->enabled = true;
>>
>> return 0;
>> @@ -311,6 +358,7 @@ static void *tce_iommu_open(unsigned long arg)
>>
>> mutex_init(&container->lock);
>> INIT_LIST_HEAD_RCU(&container->group_list);
>> + INIT_LIST_HEAD_RCU(&container->prereg_list);
>>
>> container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
>>
>> @@ -353,6 +401,14 @@ static void tce_iommu_release(void *iommu_data)
>> tce_iommu_free_table(container, tbl);
>> }
>>
>> + while (!list_empty(&container->prereg_list)) {
>> + struct tce_iommu_prereg *tcemem;
>> +
>> + tcemem = list_first_entry(&container->prereg_list,
>> + struct tce_iommu_prereg, next);
>> + tce_iommu_prereg_free(container, tcemem);
>> + }
>> +
>> tce_iommu_disable(container);
>> mmdrop(container->mm);
>> mutex_destroy(&container->lock);
>
--
Alexey