[PATCH] KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size

David Gibson david at gibson.dropbear.id.au
Wed Sep 5 13:59:52 AEST 2018


On Tue, Sep 04, 2018 at 06:16:01PM +1000, Nicholas Piggin wrote:
> THP paths can defer splitting compound pages until after the actual
> remap and TLB flushes to split a huge PMD/PUD. This causes radix
> partition scope page table mappings to get out of sync with the host
> qemu page table mappings.
> 
> This results in random memory corruption in the guest when running
> with THP. The easiest way to reproduce is to use the KVM balloon to
> free up a lot of memory in the guest and then shrink the balloon to
> give the memory back, while some work is being done in the guest.
> 
> Cc: Paul Mackerras <paulus at ozlabs.org>
> Cc: David Gibson <david at gibson.dropbear.id.au>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar at linux.ibm.com>
> Cc: linuxppc-dev at lists.ozlabs.org
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>

Seems to fix the problem with my test case.

Tested-by: David Gibson <david at gibson.dropbear.id.au>
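
For reference, a reproduction along the lines the commit message
describes might look like this from the QEMU HMP monitor, assuming the
guest has a virtio-balloon device, THP is enabled on the host, and the
guest was started with 8G of RAM (the sizes are illustrative only):

  (qemu) balloon 1024     # shrink guest memory to 1G (balloon inflates)
  ... start a memory-hungry workload in the guest ...
  (qemu) balloon 8192     # hand the memory back while the guest is busy
  (qemu) info balloon     # confirm the new target has been reached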

> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 88 ++++++++++----------------
>  1 file changed, 34 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 0af1c0aea1fe..d8792445d95a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  				   unsigned long ea, unsigned long dsisr)
>  {
>  	struct kvm *kvm = vcpu->kvm;
> -	unsigned long mmu_seq, pte_size;
> -	unsigned long gpa, gfn, hva, pfn;
> +	unsigned long mmu_seq;
> +	unsigned long gpa, gfn, hva;
>  	struct kvm_memory_slot *memslot;
>  	struct page *page = NULL;
>  	long ret;
> @@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	 */
>  	hva = gfn_to_hva_memslot(memslot, gfn);
>  	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
> -		pfn = page_to_pfn(page);
>  		upgrade_write = true;
>  	} else {
> +		unsigned long pfn;
> +
>  		/* Call KVM generic code to do the slow-path check */
>  		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
>  					   writing, upgrade_p);
> @@ -639,63 +640,42 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  		}
>  	}
>  
> -	/* See if we can insert a 1GB or 2MB large PTE here */
> -	level = 0;
> -	if (page && PageCompound(page)) {
> -		pte_size = PAGE_SIZE << compound_order(compound_head(page));
> -		if (pte_size >= PUD_SIZE &&
> -		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -		    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -			level = 2;
> -			pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> -		} else if (pte_size >= PMD_SIZE &&
> -			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -			   (hva & (PMD_SIZE - PAGE_SIZE))) {
> -			level = 1;
> -			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> -		}
> -	}
> -
>  	/*
> -	 * Compute the PTE value that we need to insert.
> +	 * Read the PTE from the process' radix tree and use that
> +	 * so we get the shift and attribute bits.
>  	 */
> -	if (page) {
> -		pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> -			_PAGE_ACCESSED;
> -		if (writing || upgrade_write)
> -			pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
> -		pte = pfn_pte(pfn, __pgprot(pgflags));
> +	local_irq_disable();
> +	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> +	pte = *ptep;
> +	local_irq_enable();
> +
> +	/* Get pte level from shift/size */
> +	if (shift == PUD_SHIFT &&
> +	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +	    (hva & (PUD_SIZE - PAGE_SIZE))) {
> +		level = 2;
> +	} else if (shift == PMD_SHIFT &&
> +		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +		   (hva & (PMD_SIZE - PAGE_SIZE))) {
> +		level = 1;
>  	} else {
> -		/*
> -		 * Read the PTE from the process' radix tree and use that
> -		 * so we get the attribute bits.
> -		 */
> -		local_irq_disable();
> -		ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> -		pte = *ptep;
> -		local_irq_enable();
> -		if (shift == PUD_SHIFT &&
> -		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -		    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -			level = 2;
> -		} else if (shift == PMD_SHIFT &&
> -			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -			   (hva & (PMD_SIZE - PAGE_SIZE))) {
> -			level = 1;
> -		} else if (shift && shift != PAGE_SHIFT) {
> -			/* Adjust PFN */
> -			unsigned long mask = (1ul << shift) - PAGE_SIZE;
> -			pte = __pte(pte_val(pte) | (hva & mask));
> -		}
> -		pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> -		if (writing || upgrade_write) {
> -			if (pte_val(pte) & _PAGE_WRITE)
> -				pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> -		} else {
> -			pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +		level = 0;
> +
> +		/* Can not cope with unknown page shift */
> +		if (shift && shift != PAGE_SHIFT) {
> +			WARN_ON_ONCE(1);
> +			return -EFAULT;
>  		}
>  	}
>  
> +	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> +	if (writing || upgrade_write) {
> +		if (pte_val(pte) & _PAGE_WRITE)
> +			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> +	} else {
> +		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +	}
> +
>  	/* Allocate space in the tree and write the PTE */
>  	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>  
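
The level selection is easy to sanity-check in isolation: a 1GB or 2MB
partition-scope mapping is only usable when the gpa and the hva share
the same page-offset bits within the candidate page size. A minimal
userspace sketch of that check (my own names, and a simplified
4K/2M/1G geometry rather than the kernel's actual one):

#include <stdio.h>

#define PAGE_SIZE (1UL << 12)          /* 4K base pages, for illustration */
#define PMD_SIZE  (1UL << 21)          /* 2M */
#define PUD_SIZE  (1UL << 30)          /* 1G */

/*
 * shift is the page shift of the host mapping, as the patch gets it
 * from __find_linux_pte(); 30 and 21 stand in for PUD_SHIFT and
 * PMD_SHIFT in this simplified geometry.
 */
static int mapping_level(unsigned long gpa, unsigned long hva, int shift)
{
	if (shift == 30 &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) == (hva & (PUD_SIZE - PAGE_SIZE)))
		return 2;	/* a 1G mapping is safe */
	if (shift == 21 &&
	    (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE)))
		return 1;	/* a 2M mapping is safe */
	return 0;		/* fall back to base pages */
}

int main(void)
{
	/* gpa and hva are both 2M-aligned: level 1 (2M) is usable */
	printf("%d\n", mapping_level(0x40200000UL, 0x7f0040200000UL, 21));
	/* offsets within the 2M block differ: must use base pages */
	printf("%d\n", mapping_level(0x40200000UL, 0x7f0040201000UL, 21));
	return 0;
}

The important change, as I read it, is that the shift now comes from
the host Linux PTE itself via __find_linux_pte(), rather than from
compound_order() on the struct page, so a THP split that has already
updated the host page tables can no longer leave us installing a stale
large mapping.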

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson