[PATCH 13/18] KVM: PPC: Book3S HV: Page table construction and page faults for radix guests

Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Mon Jan 23 14:17:20 AEDT 2017


On Thu, 2017-01-12 at 20:07 +1100, Paul Mackerras wrote:
> This adds the code to construct the second-level ("partition-scoped" in
> architecturese) page tables for guests using the radix MMU.  Apart from
> the PGD level, which is allocated when the guest is created, the rest
> of the tree is all constructed in response to hypervisor page faults.
> 
> As well as hypervisor page faults for missing pages, we also get faults
> for reference/change (RC) bits needing to be set, as well as various
> other error conditions.  For now, we only set the R or C bit in the
> guest page table if the same bit is set in the host PTE for the
> backing page.
> 
> This code can take advantage of the guest being backed with either
> transparent or ordinary 2MB huge pages, and insert 2MB page entries
> into the guest page tables.  There is no support for 1GB huge pages
> yet.
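
Just to summarise my understanding of the flow before the comment below: the
fault handler (a) turns the faulting guest real address into a host pfn via
get_user_pages_fast() (or a VM_PFNMAP lookup for I/O mappings), (b) decides
whether a 2MB or base-size leaf can be used, and (c) installs the PTE into the
partition-scoped tree with kvmppc_create_pte().  Roughly (hypothetical outline
only; error paths, RC-bit handling and page refcounting omitted, not the
actual code):

	/*
	 * Hypothetical outline only; the real logic is in
	 * kvmppc_book3s_radix_page_fault() below.
	 */
	static int radix_fault_outline(struct kvm *kvm, unsigned long gpa,
				       unsigned long hva, bool writing)
	{
		struct page *page;
		unsigned long pfn;
		unsigned int level = 0;	/* 0 = base page, 1 = 2MB PMD leaf */
		unsigned long prot = _PAGE_PRESENT | _PAGE_PTE | _PAGE_READ |
				     _PAGE_EXEC | _PAGE_ACCESSED;

		if (writing)
			prot |= _PAGE_WRITE | _PAGE_DIRTY;

		/* Fault in the backing page on the host side. */
		if (get_user_pages_fast(hva, 1, writing, &page) < 1)
			return -EFAULT;
		pfn = page_to_pfn(page);

		/* Use a 2MB leaf only if the backing page is huge and
		 * gpa/hva are congruent modulo 2MB (assumes 2MB hugepages). */
		if (PageHuge(page) &&
		    (gpa & (PMD_SIZE - 1)) == (hva & (PMD_SIZE - 1))) {
			level = 1;
			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
		}

		/* Install the PTE in the partition-scoped tree, allocating
		 * intermediate levels as needed (added by this patch). */
		return kvmppc_create_pte(kvm, pfn_pte(pfn, __pgprot(prot)),
					 gpa, level, kvm->mmu_notifier_seq);
	}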
> ---
>  arch/powerpc/include/asm/kvm_book3s.h  |   8 +
>  arch/powerpc/kvm/book3s.c              |   1 +
>  arch/powerpc/kvm/book3s_64_mmu_hv.c    |   7 +-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 385 +++++++++++++++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv.c           |  17 +-
>  5 files changed, 415 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 7adfcc0..ff5cd5c 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
>  			unsigned long status);
>  extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
>  			unsigned long slb_v, unsigned long valid);
> +extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			unsigned long gpa, gva_t ea, int is_store);
>  
>  extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
>  extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
> @@ -182,8 +184,14 @@ extern void kvmppc_mmu_hpte_sysexit(void);
>  extern int kvmppc_mmu_hv_init(void);
>  extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
>  
> +extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
> +			struct kvm_vcpu *vcpu,
> +			unsigned long ea, unsigned long dsisr);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern void kvmppc_free_radix(struct kvm *kvm);
> +extern int kvmppc_radix_init(void);
> +extern void kvmppc_radix_exit(void);
>  
>  /* XXX remove this export when load_last_inst() is generic */
>  extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 019f008..b6b5c18 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
>  	kvmppc_set_dsisr(vcpu, flags);
>  	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
>  }
> +EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage);	/* used by kvm_hv */
>  
>  void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
>  {
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index c208bf3..57690c2 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -395,8 +395,8 @@ static int instruction_is_store(unsigned int instr)
>  	return (instr & mask) != 0;
>  }
>  
> -static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
> -				  unsigned long gpa, gva_t ea, int is_store)
> +int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			   unsigned long gpa, gva_t ea, int is_store)
>  {
>  	u32 last_inst;
>  
> @@ -461,6 +461,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	unsigned long rcbits;
>  	long mmio_update;
>  
> +	if (kvm_is_radix(kvm))
> +		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
> +
>  	/*
>  	 * Real-mode code has already searched the HPT and found the
>  	 * entry we're interested in.  Lock the entry and check that
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 9091407..865ea9b 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -137,3 +137,388 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PPC_64K_PAGES
> +#define MMU_BASE_PSIZE	MMU_PAGE_64K
> +#else
> +#define MMU_BASE_PSIZE	MMU_PAGE_4K
> +#endif
> +
> +static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
> +				    unsigned int pshift)
> +{
> +	int psize = MMU_BASE_PSIZE;
> +
> +	if (pshift >= PMD_SHIFT)
> +		psize = MMU_PAGE_2M;
> +	addr &= ~0xfffUL;
> +	addr |= mmu_psize_defs[psize].ap << 5;
> +	asm volatile("ptesync": : :"memory");
> +	asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
> +		     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
> +	asm volatile("ptesync": : :"memory");
> +}
> +
> +void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr,
> +			     unsigned long set, unsigned long addr,
> +			     unsigned int shift)
> +{
> +	if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
> +	    pte_present(*ptep)) {
> +		/* have to invalidate it first */
> +		__radix_pte_update(ptep, _PAGE_PRESENT, 0);
> +		kvmppc_radix_tlbie_page(kvm, addr, shift);
> +		set |= _PAGE_PRESENT;
> +	}
> +	__radix_pte_update(ptep, clr, set);
> +}
> +
> +void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
> +			     pte_t *ptep, pte_t pte)
> +{
> +	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
> +}
> +
> +static struct kmem_cache *kvm_pte_cache;
> +
> +static pte_t *kvmppc_pte_alloc(void)
> +{
> +	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
> +}
> +
> +static void kvmppc_pte_free(pte_t *ptep)
> +{
> +	kmem_cache_free(kvm_pte_cache, ptep);
> +}
> +
> +static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
> +			     unsigned int level, unsigned long mmu_seq)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud, *new_pud = NULL;
> +	pmd_t *pmd, *new_pmd = NULL;
> +	pte_t *ptep, *new_ptep = NULL;
> +	int ret;
> +
> +	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
> +	pgd = kvm->arch.pgtable + pgd_index(gpa);
> +	pud = NULL;
> +	if (pgd_present(*pgd))
> +		pud = pud_offset(pgd, gpa);
> +	else
> +		new_pud = pud_alloc_one(kvm->mm, gpa);
> +
> +	pmd = NULL;
> +	if (pud && pud_present(*pud))
> +		pmd = pmd_offset(pud, gpa);
> +	else
> +		new_pmd = pmd_alloc_one(kvm->mm, gpa);
> +
> +	if (level == 0 && !(pmd && pmd_present(*pmd)))
> +		new_ptep = kvmppc_pte_alloc();
> +
> +	/* Check if we might have been invalidated; let the guest retry if so */
> +	spin_lock(&kvm->mmu_lock);
> +	ret = -EAGAIN;
> +	if (mmu_notifier_retry(kvm, mmu_seq))
> +		goto out_unlock;
> +
> +	/* Now traverse again under the lock and change the tree */
> +	ret = -ENOMEM;
> +	if (pgd_none(*pgd)) {
> +		if (!new_pud)
> +			goto out_unlock;
> +		pgd_populate(kvm->mm, pgd, new_pud);
> +		new_pud = NULL;
> +	}
> +	pud = pud_offset(pgd, gpa);
> +	if (pud_none(*pud)) {
> +		if (!new_pmd)
> +			goto out_unlock;
> +		pud_populate(kvm->mm, pud, new_pmd);
> +		new_pmd = NULL;
> +	}
> +	pmd = pmd_offset(pud, gpa);
> +	if (pmd_large(*pmd)) {
> +		/* Someone else has instantiated a large page here; retry */
> +		ret = -EAGAIN;
> +		goto out_unlock;
> +	}
> +	if (level == 1 && !pmd_none(*pmd)) {
> +		/*
> +		 * There's a page table page here, but we wanted
> +		 * to install a large page.  Tell the caller and let
> +		 * it try installing a normal page if it wants.
> +		 */
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +	if (level == 0) {
> +		if (pmd_none(*pmd)) {
> +			if (!new_ptep)
> +				goto out_unlock;
> +			pmd_populate(kvm->mm, pmd, new_ptep);
> +			new_ptep = NULL;
> +		}
> +		ptep = pte_offset_kernel(pmd, gpa);
> +		if (pte_present(*ptep)) {
> +			/* PTE was previously valid, so invalidate it */
> +			kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
> +						0, gpa, 0);
> +			kvmppc_radix_tlbie_page(kvm, gpa, 0);
> +		}
> +		kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
> +	} else {
> +		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
> +	}
> +	ret = 0;
> +
> + out_unlock:
> +	spin_unlock(&kvm->mmu_lock);
> +	if (new_pud)
> +		pud_free(kvm->mm, new_pud);
> +	if (new_pmd)
> +		pmd_free(kvm->mm, new_pmd);
> +	if (new_ptep)
> +		kvmppc_pte_free(new_ptep);
> +	return ret;
> +}
> +
> +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +				   unsigned long ea, unsigned long dsisr)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long mmu_seq, pte_size;
> +	unsigned long gpa, gfn, hva, pfn;
> +	struct kvm_memory_slot *memslot;
> +	struct page *page = NULL, *pages[1];
> +	long ret, npages, ok;
> +	unsigned int writing;
> +	struct vm_area_struct *vma;
> +	unsigned long flags;
> +	pte_t pte, *ptep;
> +	unsigned long pgflags;
> +	unsigned int shift, level;
> +
> +	/* Check for unusual errors */
> +	if (dsisr & DSISR_UNSUPP_MMU) {
> +		pr_err("KVM: Got unsupported MMU fault\n");
> +		return -EFAULT;
> +	}
> +	if (dsisr & DSISR_BADACCESS) {
> +		/* Reflect to the guest as DSI */
> +		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
> +		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
> +		return RESUME_GUEST;
> +	}
> +
> +	/* Translate the logical address and get the page */
> +	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
> +	gpa &= ~0xF000000000000000ul;
> +	gfn = gpa >> PAGE_SHIFT;
> +	if (!(dsisr & DSISR_PGDIRFAULT))
> +		gpa |= ea & 0xfff;
> +	memslot = gfn_to_memslot(kvm, gfn);
> +
> +	/* No memslot means it's an emulated MMIO region */
> +	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
> +		if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS |
> +			     DSISR_SET_RC)) {
> +			/*
> +			 * Bad address in guest page table tree, or other
> +			 * unusual error - reflect it to the guest as DSI.
> +			 */
> +			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
> +			return RESUME_GUEST;
> +		}
> +		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
> +					      dsisr & DSISR_ISSTORE);
> +	}
> +
> +	/* used to check for invalidations in progress */
> +	mmu_seq = kvm->mmu_notifier_seq;
> +	smp_rmb();
> +
> +	writing = (dsisr & DSISR_ISSTORE) != 0;
> +	hva = gfn_to_hva_memslot(memslot, gfn);
> +	if (dsisr & DSISR_SET_RC) {
> +		/*
> +		 * Need to set an R or C bit in the 2nd-level tables;
> +		 * if the relevant bits aren't already set in the linux
> +		 * page tables, fall through to do the gup_fast to
> +		 * set them in the linux page tables too.
> +		 */
> +		ok = 0;
> +		pgflags = _PAGE_ACCESSED;
> +		if (writing)
> +			pgflags |= _PAGE_DIRTY;
> +		local_irq_save(flags);
> +		ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
> +						   NULL, NULL);
> +		if (ptep) {
> +			pte = READ_ONCE(*ptep);
> +			if (pte_present(pte) &&
> +			    (pte_val(pte) & pgflags) == pgflags)
> +				ok = 1;
> +		}
> +		local_irq_restore(flags);
> +		if (ok) {
> +			spin_lock(&kvm->mmu_lock);
> +			if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
> +				spin_unlock(&kvm->mmu_lock);
> +				return RESUME_GUEST;
> +			}
> +			ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
> +							gpa, NULL, &shift);
> +			if (ptep && pte_present(*ptep)) {
> +				kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
> +							gpa, shift);
> +				spin_unlock(&kvm->mmu_lock);
> +				return RESUME_GUEST;
> +			}
> +			spin_unlock(&kvm->mmu_lock);
> +		}
> +	}
> +
> +	ret = -EFAULT;
> +	pfn = 0;
> +	pte_size = PAGE_SIZE;
> +	pgflags = _PAGE_READ | _PAGE_EXEC;
> +	level = 0;
> +	npages = get_user_pages_fast(hva, 1, writing, pages);
> +	if (npages < 1) {
> +		/* Check if it's an I/O mapping */
> +		down_read(&current->mm->mmap_sem);
> +		vma = find_vma(current->mm, hva);
> +		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
> +		    (vma->vm_flags & VM_PFNMAP)) {
> +			pfn = vma->vm_pgoff +
> +				((hva - vma->vm_start) >> PAGE_SHIFT);
> +			pgflags = pgprot_val(vma->vm_page_prot);
> +		}
> +		up_read(&current->mm->mmap_sem);
> +		if (!pfn)
> +			return -EFAULT;
> +	} else {
> +		page = pages[0];
> +		pfn = page_to_pfn(page);
> +		if (PageHuge(page)) {
> +			page = compound_head(page);
> +			pte_size <<= compound_order(page);
> +			/* See if we can insert a 2MB large-page PTE here */
> +			if (pte_size >= PMD_SIZE &&
> +			    (gpa & PMD_MASK & PAGE_MASK) ==
> +			    (hva & PMD_MASK & PAGE_MASK)) {
> +				level = 1;
> +				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> +			}
> +		}
> +		/* See if we can provide write access */
> +		if (writing) {
> +			/*
> +			 * We assume gup_fast has set dirty on the host PTE.
> +			 */
> +			pgflags |= _PAGE_WRITE;
> +		} else {
> +			local_irq_save(flags);
> +			ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
> +							hva, NULL, NULL);
> +			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
> +				pgflags |= _PAGE_WRITE;
> +			local_irq_restore(flags);
> +		}
> +	}
> +
> +	/*
> +	 * Compute the PTE value that we need to insert.
> +	 */
> +	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
> +	if (pgflags & _PAGE_WRITE)
> +		pgflags |= _PAGE_DIRTY;
> +	pte = pfn_pte(pfn, __pgprot(pgflags));
> +
> +	/* Allocate space in the tree and write the PTE */
> +	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
> +	if (ret == -EBUSY) {
> +		/*
> +		 * There's already a PMD where we wanted to install a large page;
> +		 * for now, fall back to installing a small page.
> +		 */
> +		level = 0;
> +		pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
> +		pte = pfn_pte(pfn, __pgprot(pgflags));
> +		ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
> +	}
> +	if (ret == 0 || ret == -EAGAIN)
> +		ret = RESUME_GUEST;
> +
> +	if (page) {
> +		/*
> +		 * We drop pages[0] here, not page because page might
> +		 * have been set to the head page of a compound, but
> +		 * we have to drop the reference on the correct tail
> +		 * page to match the get inside gup()
> +		 */
> +		put_page(pages[0]);
> +	}
> +	return ret;
> +}
> +
> +void kvmppc_free_radix(struct kvm *kvm)
> +{
> +	unsigned long ig, iu, im;
> +	pte_t *pte;
> +	pmd_t *pmd;
> +	pud_t *pud;
> +	pgd_t *pgd;
> +
> +	if (!kvm->arch.pgtable)
> +		return;
> +	pgd = kvm->arch.pgtable;
> +	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
> +		if (!pgd_present(*pgd))
> +			continue;
> +		pud = pud_offset(pgd, 0);
> +		for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
> +			if (!pud_present(*pud))
> +				continue;
> +			pmd = pmd_offset(pud, 0);
> +			for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
> +				if (pmd_huge(*pmd)) {
> +					pmd_clear(pmd);
> +					continue;
> +				}
> +				if (!pmd_present(*pmd))
> +					continue;
> +				pte = pte_offset_map(pmd, 0);
> +				memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
> +				kvmppc_pte_free(pte);
> +				pmd_clear(pmd);
> +			}
> +			pmd_free(kvm->mm, pmd_offset(pud, 0));
> +			pud_clear(pud);
> +		}
> +		pud_free(kvm->mm, pud_offset(pgd, 0));
> +		pgd_clear(pgd);
> +	}
> +	pgd_free(kvm->mm, kvm->arch.pgtable);
> +}
> +
> +static void pte_ctor(void *addr)
> +{
> +	memset(addr, 0, PTE_TABLE_SIZE);
> +}
> +
> +int kvmppc_radix_init(void)
> +{
> +	unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;
> +
> +	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
> +	if (!kvm_pte_cache)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
> +void kvmppc_radix_exit(void)
> +{
> +	kmem_cache_destroy(kvm_pte_cache);
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6bd0f4a..4c2d054 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3357,7 +3357,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>  
>  	kvmppc_free_vcores(kvm);
>  
> -	kvmppc_free_hpt(kvm);
> +	if (kvm->arch.radix)
kvm_is_radix() for consistency?
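i.e. something like the below, using the kvm_is_radix() helper that this patch
already uses in kvmppc_book3s_hv_page_fault():

	if (kvm_is_radix(kvm))
		kvmppc_free_radix(kvm);
	else
		kvmppc_free_hpt(kvm);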
> +		kvmppc_free_radix(kvm);
> +	else
> +		kvmppc_free_hpt(kvm);
>  
>  	kvmppc_free_pimap(kvm);
>  }
> @@ -3769,6 +3772,11 @@ static int kvm_init_subcore_bitmap(void)
>  	return 0;
>  }
>  
> +static int kvmppc_radix_possible(void)
> +{
> +	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
> +}
> +
>  static int kvmppc_book3s_init_hv(void)
>  {
>  	int r;
> @@ -3808,12 +3816,19 @@ static int kvmppc_book3s_init_hv(void)
>  	init_vcore_lists();
>  
>  	r = kvmppc_mmu_hv_init();
> +	if (r)
> +		return r;
> +
> +	if (kvmppc_radix_possible())
> +		r = kvmppc_radix_init();
>  	return r;
>  }
>  
>  static void kvmppc_book3s_exit_hv(void)
>  {
>  	kvmppc_free_host_rm_ops();
> +	if (kvmppc_radix_possible())
> +		kvmppc_radix_exit();
>  	kvmppc_hv_ops = NULL;
>  }
>  

