[PATCH] KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size
David Gibson
david at gibson.dropbear.id.au
Wed Sep 5 13:59:52 AEST 2018
On Tue, Sep 04, 2018 at 06:16:01PM +1000, Nicholas Piggin wrote:
> THP paths can defer splitting compound pages until after the actual
> remap and TLB flushes to split a huge PMD/PUD. This causes radix
> partition scope page table mappings to get out of synch with the host
> qemu page table mappings.
>
> This results in random memory corruption in the guest when running
> with THP. The easiest way to reproduce is to use the KVM balloon to free up
> a lot of memory in the guest and then shrink the balloon to give the
> memory back, while some work is being done in the guest.
>
> Cc: Paul Mackerras <paulus at ozlabs.org>
> Cc: David Gibson <david at gibson.dropbear.id.au>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar at linux.ibm.com>
> Cc: linuxppc-dev at lists.ozlabs.org
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
Seems to fix the problem on my test case.
Tested-by: David Gibson <david at gibson.dropbear.id.au>
> ---
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 88 ++++++++++----------------
> 1 file changed, 34 insertions(+), 54 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 0af1c0aea1fe..d8792445d95a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> unsigned long ea, unsigned long dsisr)
> {
> struct kvm *kvm = vcpu->kvm;
> - unsigned long mmu_seq, pte_size;
> - unsigned long gpa, gfn, hva, pfn;
> + unsigned long mmu_seq;
> + unsigned long gpa, gfn, hva;
> struct kvm_memory_slot *memslot;
> struct page *page = NULL;
> long ret;
> @@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> */
> hva = gfn_to_hva_memslot(memslot, gfn);
> if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
> - pfn = page_to_pfn(page);
> upgrade_write = true;
> } else {
> + unsigned long pfn;
> +
> /* Call KVM generic code to do the slow-path check */
> pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> writing, upgrade_p);
> @@ -639,63 +640,42 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> }
> }
>
> - /* See if we can insert a 1GB or 2MB large PTE here */
> - level = 0;
> - if (page && PageCompound(page)) {
> - pte_size = PAGE_SIZE << compound_order(compound_head(page));
> - if (pte_size >= PUD_SIZE &&
> - (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> - (hva & (PUD_SIZE - PAGE_SIZE))) {
> - level = 2;
> - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> - } else if (pte_size >= PMD_SIZE &&
> - (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> - (hva & (PMD_SIZE - PAGE_SIZE))) {
> - level = 1;
> - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> - }
> - }
> -
> /*
> - * Compute the PTE value that we need to insert.
> + * Read the PTE from the process' radix tree and use that
> + * so we get the shift and attribute bits.
> */
> - if (page) {
> - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> - _PAGE_ACCESSED;
> - if (writing || upgrade_write)
> - pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
> - pte = pfn_pte(pfn, __pgprot(pgflags));
> + local_irq_disable();
> + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> + pte = *ptep;
> + local_irq_enable();
> +
> + /* Get pte level from shift/size */
> + if (shift == PUD_SHIFT &&
> + (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> + (hva & (PUD_SIZE - PAGE_SIZE))) {
> + level = 2;
> + } else if (shift == PMD_SHIFT &&
> + (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> + (hva & (PMD_SIZE - PAGE_SIZE))) {
> + level = 1;
> } else {
> - /*
> - * Read the PTE from the process' radix tree and use that
> - * so we get the attribute bits.
> - */
> - local_irq_disable();
> - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> - pte = *ptep;
> - local_irq_enable();
> - if (shift == PUD_SHIFT &&
> - (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> - (hva & (PUD_SIZE - PAGE_SIZE))) {
> - level = 2;
> - } else if (shift == PMD_SHIFT &&
> - (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> - (hva & (PMD_SIZE - PAGE_SIZE))) {
> - level = 1;
> - } else if (shift && shift != PAGE_SHIFT) {
> - /* Adjust PFN */
> - unsigned long mask = (1ul << shift) - PAGE_SIZE;
> - pte = __pte(pte_val(pte) | (hva & mask));
> - }
> - pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> - if (writing || upgrade_write) {
> - if (pte_val(pte) & _PAGE_WRITE)
> - pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> - } else {
> - pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> + level = 0;
> +
> + /* Can not cope with unknown page shift */
> + if (shift && shift != PAGE_SHIFT) {
> + WARN_ON_ONCE(1);
> + return -EFAULT;
> }
> }
>
> + pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> + if (writing || upgrade_write) {
> + if (pte_val(pte) & _PAGE_WRITE)
> + pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> + } else {
> + pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> + }
> +
> /* Allocate space in the tree and write the PTE */
> ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20180905/834dbd76/attachment.sig>
More information about the Linuxppc-dev
mailing list