[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
Alexander Graf
agraf at suse.de
Wed Jul 10 03:32:19 EST 2013
On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
> This adds special support for huge pages (16MB). The reference
> counting cannot be easily done for such pages in real mode (when
> MMU is off) so we added a list of huge pages. It is populated in
> virtual mode and get_page is called just once per a huge page.
> Real mode handlers check if the requested page is huge and in the list,
> then no reference counting is done, otherwise an exit to virtual mode
> happens. The list is released at KVM exit. At the moment the fastest
> card available for tests uses up to 9 huge pages so walking through this
> list is not very expensive. However this can change and we may want
> to optimize this.
>
> Signed-off-by: Paul Mackerras<paulus at samba.org>
> Signed-off-by: Alexey Kardashevskiy<aik at ozlabs.ru>
>
> ---
>
> Changes:
> 2013/06/27:
> * list of huge pages replaces with hashtable for better performance
So the only thing your patch description really talks about is not true
anymore?
> * spinlock removed from real mode and only protects insertion of new
> huge [ages descriptors into the hashtable
>
> 2013/06/05:
> * fixed compile error when CONFIG_IOMMU_API=n
>
> 2013/05/20:
> * the real mode handler now searches for a huge page by gpa (used to be pte)
> * the virtual mode handler prints warning if it is called twice for the same
> huge page as the real mode handler is expected to fail just once - when a huge
> page is not in the list yet.
> * the huge page is refcounted twice - when added to the hugepage list and
> when used in the virtual mode hcall handler (can be optimized but it will
> make the patch less nice).
>
> Signed-off-by: Alexey Kardashevskiy<aik at ozlabs.ru>
> ---
> arch/powerpc/include/asm/kvm_host.h | 25 +++++++++
> arch/powerpc/kernel/iommu.c | 6 ++-
> arch/powerpc/kvm/book3s_64_vio.c | 104 +++++++++++++++++++++++++++++++++---
> arch/powerpc/kvm/book3s_64_vio_hv.c | 21 ++++++--
> 4 files changed, 146 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 53e61b2..a7508cf 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -30,6 +30,7 @@
> #include<linux/kvm_para.h>
> #include<linux/list.h>
> #include<linux/atomic.h>
> +#include<linux/hashtable.h>
> #include<asm/kvm_asm.h>
> #include<asm/processor.h>
> #include<asm/page.h>
> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
> u32 window_size;
> struct iommu_group *grp; /* used for IOMMU groups */
> struct vfio_group *vfio_grp; /* used for IOMMU groups */
> + DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */
> + spinlock_t hugepages_write_lock; /* used for IOMMU groups */
> struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
> struct page *pages[0];
> };
>
> +/*
> + * The KVM guest can be backed with 16MB pages.
> + * In this case, we cannot do page counting from the real mode
> + * as the compound pages are used - they are linked in a list
> + * with pointers as virtual addresses which are inaccessible
> + * in real mode.
> + *
> + * The code below keeps a 16MB pages list and uses page struct
> + * in real mode if it is already locked in RAM and inserted into
> + * the list or switches to the virtual mode where it can be
> + * handled in a usual manner.
> + */
> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa) hash_32(gpa>> 24, 32)
> +
> +struct kvmppc_spapr_iommu_hugepage {
> + struct hlist_node hash_node;
> + unsigned long gpa; /* Guest physical address */
> + unsigned long hpa; /* Host physical address */
> + struct page *page; /* page struct of the very first subpage */
> + unsigned long size; /* Huge page size (always 16MB at the moment) */
> +};
> +
> struct kvmppc_linear_info {
> void *base_virt;
> unsigned long base_pfn;
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 51678ec..e0b6eca 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
> if (!pg) {
> ret = -EAGAIN;
> } else if (PageCompound(pg)) {
> - ret = -EAGAIN;
> + /* Hugepages will be released at KVM exit */
> + ret = 0;
> } else {
> if (oldtce& TCE_PCI_WRITE)
> SetPageDirty(pg);
> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
> struct page *pg = pfn_to_page(oldtce>> PAGE_SHIFT);
> if (!pg) {
> ret = -EAGAIN;
> + } else if (PageCompound(pg)) {
> + /* Hugepages will be released at KVM exit */
> + ret = 0;
> } else {
> if (oldtce& TCE_PCI_WRITE)
> SetPageDirty(pg);
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 2b51f4a..c037219 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -46,6 +46,40 @@
>
> #define ERROR_ADDR ((void *)~(unsigned long)0x0)
>
> +#ifdef CONFIG_IOMMU_API
Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?
> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
> +{
> + spin_lock_init(&tt->hugepages_write_lock);
> + hash_init(tt->hash_tab);
> +}
> +
> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
> +{
> + int bkt;
> + struct kvmppc_spapr_iommu_hugepage *hp;
> + struct hlist_node *tmp;
> +
> + spin_lock(&tt->hugepages_write_lock);
> + hash_for_each_safe(tt->hash_tab, bkt, tmp, hp, hash_node) {
> + pr_debug("Release HP liobn=%llx #%u gpa=%lx hpa=%lx size=%ld\n",
> + tt->liobn, bkt, hp->gpa, hp->hpa, hp->size);
trace point
> + hlist_del_rcu(&hp->hash_node);
> +
> + put_page(hp->page);
Don't you have to mark them dirty?
> + kfree(hp);
> + }
> + spin_unlock(&tt->hugepages_write_lock);
> +}
> +#else
> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
> +{
> +}
> +
> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
> +{
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
> static long kvmppc_stt_npages(unsigned long window_size)
> {
> return ALIGN((window_size>> SPAPR_TCE_SHIFT)
> @@ -112,6 +146,7 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
>
> mutex_lock(&kvm->lock);
> list_del(&stt->list);
> + kvmppc_iommu_hugepages_cleanup(stt);
>
> #ifdef CONFIG_IOMMU_API
> if (stt->grp) {
> @@ -200,6 +235,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> kvm_get_kvm(kvm);
>
> mutex_lock(&kvm->lock);
> + kvmppc_iommu_hugepages_init(stt);
> list_add(&stt->list,&kvm->arch.spapr_tce_tables);
>
> mutex_unlock(&kvm->lock);
> @@ -283,6 +319,7 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>
> kvm_get_kvm(kvm);
> mutex_lock(&kvm->lock);
> + kvmppc_iommu_hugepages_init(tt);
> list_add(&tt->list,&kvm->arch.spapr_tce_tables);
> mutex_unlock(&kvm->lock);
>
> @@ -307,10 +344,17 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>
> /* Converts guest physical address to host virtual address */
> static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
> + struct kvmppc_spapr_tce_table *tt,
> unsigned long gpa, struct page **pg, unsigned long *hpa)
> {
> unsigned long hva, gfn = gpa>> PAGE_SHIFT;
> struct kvm_memory_slot *memslot;
> +#ifdef CONFIG_IOMMU_API
> + struct kvmppc_spapr_iommu_hugepage *hp;
> + unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
> + pte_t *ptep;
> + unsigned int shift = 0;
> +#endif
>
> memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> if (!memslot)
> @@ -325,6 +369,54 @@ static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
> *hpa = __pa((unsigned long) page_address(*pg)) +
> (hva& ~PAGE_MASK);
>
> +#ifdef CONFIG_IOMMU_API
This function is becoming incredibly large. Please split it up. Also
please document the code.
Alex
> + if (!PageCompound(*pg))
> + return (void *) hva;
> +
> + spin_lock(&tt->hugepages_write_lock);
> + hash_for_each_possible_rcu(tt->hash_tab, hp, hash_node, key) {
> + if ((gpa< hp->gpa) || (gpa>= hp->gpa + hp->size))
> + continue;
> + if (hpa)
> + *hpa = __pa((unsigned long) page_address(hp->page)) +
> + (hva& (hp->size - 1));
> + goto unlock_exit;
> + }
> +
> + ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva,&shift);
> + WARN_ON(!ptep);
> +
> + if (!ptep || (shift<= PAGE_SHIFT)) {
> + hva = (unsigned long) ERROR_ADDR;
> + goto unlock_exit;
> + }
> +
> + hp = kzalloc(sizeof(*hp), GFP_KERNEL);
> + if (!hp) {
> + hva = (unsigned long) ERROR_ADDR;
> + goto unlock_exit;
> + }
> +
> + hp->gpa = gpa& ~((1<< shift) - 1);
> + hp->hpa = (pte_pfn(*ptep)<< PAGE_SHIFT);
> + hp->size = 1<< shift;
> +
> + if (get_user_pages_fast(hva& ~(hp->size - 1), 1, 1,&hp->page) != 1) {
> + hva = (unsigned long) ERROR_ADDR;
> + kfree(hp);
> + goto unlock_exit;
> + }
> + hash_add_rcu(tt->hash_tab,&hp->hash_node, key);
> +
> + if (hpa)
> + *hpa = __pa((unsigned long) page_address(hp->page)) +
> + (hva& (hp->size - 1));
> +unlock_exit:
> + spin_unlock(&tt->hugepages_write_lock);
> +
> + put_page(*pg);
> + *pg = NULL;
> +#endif /* CONFIG_IOMMU_API */
> return (void *) hva;
> }
>
> @@ -363,7 +455,7 @@ long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
> if (iommu_tce_put_param_check(tbl, ioba, tce))
> return H_PARAMETER;
>
> - hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce,&pg,&hpa);
> + hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce,&pg,&hpa);
> if (hva == ERROR_ADDR)
> return H_HARDWARE;
> }
> @@ -372,7 +464,7 @@ long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
> return H_SUCCESS;
>
> pg = pfn_to_page(hpa>> PAGE_SHIFT);
> - if (pg)
> + if (pg&& !PageCompound(pg))
> put_page(pg);
>
> return H_HARDWARE;
> @@ -414,7 +506,7 @@ static long kvmppc_vm_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
> (i<< IOMMU_PAGE_SHIFT), gpa))
> return H_PARAMETER;
>
> - hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, gpa,&pg,
> + hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, gpa,&pg,
> &vcpu->arch.tce_tmp_hpas[i]);
> if (hva == ERROR_ADDR)
> goto putpages_flush_exit;
> @@ -429,7 +521,7 @@ putpages_flush_exit:
> for ( --i; i>= 0; --i) {
> struct page *pg;
> pg = pfn_to_page(vcpu->arch.tce_tmp_hpas[i]>> PAGE_SHIFT);
> - if (pg)
> + if (pg&& !PageCompound(pg))
> put_page(pg);
> }
>
> @@ -517,7 +609,7 @@ long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> if ((ioba + (npages<< IOMMU_PAGE_SHIFT))> tt->window_size)
> return H_PARAMETER;
>
> - tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce_list,&pg, NULL);
> + tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce_list,&pg, NULL);
> if (tces == ERROR_ADDR)
> return H_TOO_HARD;
>
> @@ -547,7 +639,7 @@ long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> kvmppc_emulated_put_tce(tt, ioba + (i<< IOMMU_PAGE_SHIFT),
> vcpu->arch.tce_tmp_hpas[i]);
> put_list_page_exit:
> - if (pg)
> + if (pg&& !PageCompound(pg))
> put_page(pg);
>
> if (vcpu->arch.tce_rm_fail != TCERM_NONE) {
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index f8103c6..8c6449f 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -132,6 +132,7 @@ EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
> * returns ERROR_ADDR if failed.
> */
> static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
> + struct kvmppc_spapr_tce_table *tt,
> unsigned long gpa, struct page **pg)
> {
> struct kvm_memory_slot *memslot;
> @@ -139,6 +140,20 @@ static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
> unsigned long hva, hpa = ERROR_ADDR;
> unsigned long gfn = gpa>> PAGE_SHIFT;
> unsigned shift = 0;
> + struct kvmppc_spapr_iommu_hugepage *hp;
> +
> + /* Try to find an already used hugepage */
> + unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
> +
> + hash_for_each_possible_rcu_notrace(tt->hash_tab, hp,
> + hash_node, key) {
> + if ((gpa< hp->gpa) || (gpa>= hp->gpa + hp->size))
> + continue;
> +
> + *pg = NULL; /* Tell the caller not to put page */
> +
> + return hp->hpa + (gpa& (hp->size - 1));
> + }
>
> memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> if (!memslot)
> @@ -208,7 +223,7 @@ static long kvmppc_h_put_tce_iommu(struct kvm_vcpu *vcpu,
> if (iommu_tce_put_param_check(tbl, ioba, tce))
> return H_PARAMETER;
>
> - hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce,&pg);
> + hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce,&pg);
> if (hpa == ERROR_ADDR)
> return H_TOO_HARD;
>
> @@ -247,7 +262,7 @@ static long kvmppc_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
>
> /* Translate TCEs and go get_page() */
> for (i = 0; i< npages; ++i) {
> - hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tces[i],&pg);
> + hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tces[i],&pg);
> if (hpa == ERROR_ADDR) {
> vcpu->arch.tce_tmp_num = i;
> vcpu->arch.tce_rm_fail = TCERM_GETPAGE;
> @@ -342,7 +357,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> if ((ioba + (npages<< IOMMU_PAGE_SHIFT))> tt->window_size)
> return H_PARAMETER;
>
> - tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce_list,&pg);
> + tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce_list,&pg);
> if (tces == ERROR_ADDR)
> return H_TOO_HARD;
>
More information about the Linuxppc-dev
mailing list