[RFC PATCH 10/11] KVM: PPC: Implement MMU notifiers
Paul Mackerras
paulus at samba.org
Thu Nov 17 10:52:20 EST 2011
This implements the low-level functions called by the MMU notifiers in
the generic KVM code, and defines KVM_ARCH_WANT_MMU_NOTIFIER when
CONFIG_KVM_BOOK3S_64_HV is set, so that the generic KVM MMU notifier
code gets included.
That means we also have to take notice of when PTE invalidations are
in progress, as indicated by mmu_notifier_retry(). In kvmppc_h_enter,
if any invalidation is in progress we just install a non-present HPTE.
In kvmppc_book3s_hv_page_fault, if an invalidation is in progress we
just return to the guest without resolving the fault, causing it to
encounter another page fault immediately. This is better than spinning
inside kvmppc_book3s_hv_page_fault because this way the guest can be
preempted by a hypervisor decrementer interrupt without us having to do
any special checks.
We currently maintain a referenced bit in the rmap array, and when we
clear it, we make all the HPTEs that map the corresponding page
non-present, as if the page had been invalidated. In the future we could
use the hardware reference bit in the guest HPT instead.
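Concretely, the aging path is the kvm_age_rmapp() handler added below;
reproduced here with explanatory comments (the comments are mine, not
part of the patch):

	static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
				 unsigned long gfn)
	{
		/* nothing has referenced this page since the last clear */
		if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
			return 0;
		/* make the HPTEs mapping this page non-present, as on
		 * invalidation, so any new access faults and sets the
		 * referenced bit again */
		kvm_unmap_rmapp(kvm, rmapp, gfn);
		/* clear the software referenced bit under the rmap lock */
		while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
			cpu_relax();
		__clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
		__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
		return 1;		/* the page was referenced */
	}
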
kvm_set_spte_hva is implemented simply as kvm_unmap_hva; it appears to
be unused anyway.
This all means that on processors that support virtual partition
memory (POWER7), we can claim support for the KVM_CAP_SYNC_MMU
capability, and we no longer have to pin all the guest memory.
Signed-off-by: Paul Mackerras <paulus at samba.org>
---
arch/powerpc/include/asm/kvm_host.h | 13 +++
arch/powerpc/kvm/Kconfig | 1 +
arch/powerpc/kvm/book3s_64_mmu_hv.c | 160 ++++++++++++++++++++++++++++++++++-
arch/powerpc/kvm/book3s_hv.c | 25 +++--
arch/powerpc/kvm/book3s_hv_rm_mmu.c | 34 ++++++-
arch/powerpc/kvm/powerpc.c | 3 +
6 files changed, 218 insertions(+), 18 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3dfac3d..79bfc69 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,19 @@
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+#include <linux/mmu_notifier.h>
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
+struct kvm;
+extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+#endif
+
/* We don't currently support large pages. */
#define KVM_HPAGE_GFN_SHIFT(x) 0
#define KVM_NR_PAGE_SIZES 1
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 78133de..8f64709 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -69,6 +69,7 @@ config KVM_BOOK3S_64
config KVM_BOOK3S_64_HV
bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
depends on KVM_BOOK3S_64
+ select MMU_NOTIFIER
---help---
Support running unmodified book3s_64 guest kernels in
virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index e93c789..8c497b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -138,6 +138,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
hp1 = hpte1_pgsize_encoding(psize) |
HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+ spin_lock(&kvm->mmu_lock);
+ /* wait until no invalidations are in progress */
+ while (kvm->mmu_notifier_count) {
+ spin_unlock(&kvm->mmu_lock);
+ while (kvm->mmu_notifier_count)
+ cpu_relax();
+ spin_lock(&kvm->mmu_lock);
+ }
+
for (i = 0; i < npages; ++i) {
addr = i << porder;
if (pfns) {
@@ -185,6 +194,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT;
}
}
+ spin_unlock(&kvm->mmu_lock);
}
int kvmppc_mmu_hv_init(void)
@@ -506,7 +516,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
struct kvmppc_slb *slbe;
unsigned long *hptep, hpte[3];
- unsigned long psize, pte_size;
+ unsigned long mmu_seq, psize, pte_size;
unsigned long gfn, hva, pfn, amr;
struct kvm_memory_slot *memslot;
unsigned long *rmap;
@@ -581,6 +591,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (kvm->arch.slot_pfns[memslot->id])
return -EFAULT; /* should never get here */
hva = gfn_to_hva_memslot(memslot, gfn);
+
+ /* used to check for invalidations in progress */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
npages = get_user_pages_fast(hva, 1, 1, pages);
if (npages < 1)
return -EFAULT;
@@ -596,9 +611,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
pfn = page_to_pfn(page);
+ /* Check if we might have been invalidated; let the guest retry if so */
+ ret = RESUME_GUEST;
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
+
/* Set the HPTE to point to pfn */
ret = RESUME_GUEST;
- hptep = (unsigned long *)kvm->arch.hpt_virt + (index << 1);
+ hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
rev = &kvm->arch.revmap[index];
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
@@ -606,7 +627,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
rev->guest_rpte != hpte[2]) {
/* HPTE has been changed under us; let the guest retry */
hptep[0] &= ~HPTE_V_HVLOCK;
- goto out_put;
+ goto out_unlock;
}
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
@@ -617,6 +638,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (page)
SetPageDirty(page);
+ out_unlock:
+ spin_unlock(&kvm->mmu_lock);
out_put:
if (page)
put_page(page);
@@ -635,6 +658,137 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return RESUME_GUEST;
}
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn))
+{
+ int i;
+ int ret;
+ int retval = 0;
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots(kvm);
+ for (i = 0; i < slots->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
+ ret = handler(kvm, &memslot->rmap[gfn_offset],
+ memslot->base_gfn + gfn_offset);
+ retval |= ret;
+ }
+ }
+
+ return retval;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn)
+{
+ struct revmap_entry *rev = kvm->arch.revmap;
+ unsigned long h, i, j;
+ unsigned long *hptep, new_hpte[2];
+ unsigned long ptel, psize;
+ int n = 0;
+
+ for (;;) {
+ while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+ cpu_relax();
+ if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+ __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+ break;
+ }
+
+ /*
+ * To avoid an ABBA deadlock with the HPTE lock bit,
+ * we have to unlock the rmap chain before locking the HPTE.
+ * Thus we remove the first entry, unlock the rmap chain,
+ * lock the HPTE and then check that it is for the
+ * page we're unmapping before changing it to non-present.
+ */
+ i = *rmapp & KVMPPC_RMAP_INDEX;
+ j = rev[i].forw;
+ if (j == i) {
+ /* chain is now empty */
+ j = 0;
+ } else {
+ /* remove i from chain */
+ h = rev[i].back;
+ rev[h].forw = j;
+ rev[j].back = h;
+ rev[i].forw = rev[i].back = i;
+ j |= KVMPPC_RMAP_PRESENT;
+ }
+ smp_wmb();
+ *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT);
+
+ /* Now lock, check and modify the HPTE */
+ hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+ while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+ cpu_relax();
+ ptel = rev[i].guest_rpte;
+ psize = hpte_page_size(hptep[0], ptel);
+ if ((hptep[0] & HPTE_V_VALID) &&
+ hpte_rpn(ptel, psize) == gfn) {
+ new_hpte[0] = hptep[0] | HPTE_V_ABSENT;
+ if ((new_hpte[0] & 0xffffffffff000000ul) ==
+ (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
+ new_hpte[0] &= ~HPTE_V_VALID;
+ new_hpte[1] = (ptel & ~(HPTE_R_PP0 - psize)) |
+ HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+ kvmppc_modify_hpte(kvm, hptep, new_hpte, i);
+ ++n;
+ } else {
+ hptep[0] &= ~HPTE_V_HVLOCK;
+ }
+ }
+ return 0;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ return 0;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn)
+{
+ if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
+ return 0;
+ kvm_unmap_rmapp(kvm, rmapp, gfn);
+ while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+ cpu_relax();
+ __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
+ __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+ return 1;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn)
+{
+ return !!(*rmapp & KVMPPC_RMAP_REFERENCED);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
unsigned long *nb_ret)
{
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 47053e9..9e67320 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1278,10 +1278,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
}
- pfns = vzalloc(npages * sizeof(unsigned long));
- if (!pfns)
- return -ENOMEM;
- kvm->arch.slot_pfns[mem->slot] = pfns;
+ if (!cpu_has_feature(CPU_FTR_ARCH_206)) {
+ pfns = vzalloc(npages * sizeof(unsigned long));
+ if (!pfns)
+ return -ENOMEM;
+ kvm->arch.slot_pfns[mem->slot] = pfns;
+ }
return 0;
@@ -1305,12 +1307,14 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
return;
pfns = kvm->arch.slot_pfns[mem->slot];
- npages = mem->memory_size >> porder;
- for (i = 0; i < npages; ++i) {
- hva = mem->userspace_addr + (i << porder);
- page = hva_to_page(hva);
- if (page)
- pfns[i] = page_to_pfn(page);
+ if (pfns) {
+ npages = mem->memory_size >> porder;
+ for (i = 0; i < npages; ++i) {
+ hva = mem->userspace_addr + (i << porder);
+ page = hva_to_page(hva);
+ if (page)
+ pfns[i] = page_to_pfn(page);
+ }
}
if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
@@ -1384,6 +1388,7 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
page = pfn_to_page(pfns[j]);
if (PageHuge(page))
page = compound_head(page);
+ SetPageDirty(page);
put_page(page);
}
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 622bfcd..2cadd06 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -143,11 +143,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long *rmap;
pte_t *ptep;
unsigned int shift;
+ unsigned long mmu_seq;
+ long err;
psize = hpte_page_size(pteh, ptel);
if (!psize)
return H_PARAMETER;
+ /* used later to detect if we might have been invalidated */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
/* Find the memslot (if any) for this address */
gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
gfn = gpa >> PAGE_SHIFT;
@@ -212,6 +218,18 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
return H_PARAMETER;
}
+ /*
+ * Now that we're about to write the HPTE and thus give the guest
+ * access to the page, check for any pending invalidations.
+ * We don't need to worry about that if this is a non-present page.
+ * Note that the HPTE bitlock has to nest inside the kvm->mmu_lock.
+ */
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ /* inval in progress, write a non-present HPTE */
+ pa = 0;
+
+ err = H_PARAMETER;
if (!pa) {
/*
* If this is a non-present page for any reason
@@ -222,7 +240,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
* On 970 we have to have all pages present.
*/
if (!cpu_has_feature(CPU_FTR_ARCH_206))
- return H_PARAMETER;
+ goto out;
pteh |= HPTE_V_ABSENT;
if ((pteh & 0xffffffffff000000ul) ==
(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
@@ -231,14 +249,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
}
+ /* Find and lock the HPTEG slot to use */
if (pte_index >= HPT_NPTE)
- return H_PARAMETER;
+ goto out;
+ err = H_PTEG_FULL;
if (likely((flags & H_EXACT) == 0)) {
pte_index &= ~7UL;
hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
for (i = 0; ; ++i) {
if (i == 8)
- return H_PTEG_FULL;
+ goto out;
if ((*hpte & HPTE_V_VALID) == 0 &&
try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
HPTE_V_ABSENT))
@@ -250,7 +270,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
HPTE_V_ABSENT))
- return H_PTEG_FULL;
+ goto out;
}
/* Save away the guest's idea of the second HPTE dword */
@@ -272,7 +292,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
asm volatile("ptesync" : : : "memory");
vcpu->arch.gpr[4] = pte_index;
- return H_SUCCESS;
+ err = H_SUCCESS;
+
+ out:
+ spin_unlock(&kvm->mmu_lock);
+ return err;
}
#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 084d1c5..0f10a04 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -244,6 +244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
if (cpu_has_feature(CPU_FTR_ARCH_201))
r = 2;
break;
+ case KVM_CAP_SYNC_MMU:
+ r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+ break;
#endif
default:
r = 0;
--
1.7.7.2