[PATCH -V2 2/2] powerpc/mm/thp: Make page table walk safe against thp split/collapse
Aneesh Kumar K.V
aneesh.kumar at linux.vnet.ibm.com
Wed Mar 25 22:08:07 AEDT 2015
We can disable a THP split or a hugepage collapse by disabling irq.
We do send IPI to all the cpus in the early part of split/collapse,
and disabling local irq ensure we don't make progress with
split/collapse. If the THP is getting split we return NULL from
find_linux_pte_or_hugepte(). For all the current callers it should be ok.
We need to be careful if we want to use returned pte_t pointer outside
the irq disabled region. W.r.t to THP split, the pfn remains the same,
but then a hugepage collapse will result in a pfn change. There are
few steps we can take to avoid a hugepage collapse.One way is to take page
reference inside the irq disable region. Other option is to take
mmap_sem so that a parallel collapse will not happen. We can also
disable collapse by taking pmd_lock. Another method used by kvm
subsystem is to check whether we had a mmu_notifer update in between
using mmu_notifier_retry().
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
---
arch/powerpc/include/asm/kvm_book3s_64.h | 12 +----
arch/powerpc/include/asm/pgtable.h | 28 +++-------
arch/powerpc/kernel/eeh.c | 6 ++-
arch/powerpc/kernel/io-workarounds.c | 10 ++--
arch/powerpc/kvm/book3s_64_mmu_hv.c | 14 +++--
arch/powerpc/kvm/book3s_hv_rm_mmu.c | 91 ++++++++++++++++++--------------
arch/powerpc/kvm/e500_mmu_host.c | 16 ++++--
arch/powerpc/mm/hash_utils_64.c | 2 +-
arch/powerpc/mm/hugetlbpage.c | 29 +++++++---
arch/powerpc/perf/callchain.c | 24 +++++----
10 files changed, 125 insertions(+), 107 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index f06820c67175..5233a35d80e2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -281,11 +281,9 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
/*
* If it's present and writable, atomically set dirty and referenced bits and
- * return the PTE, otherwise return 0. If we find a transparent hugepage
- * and if it is marked splitting we return 0;
+ * return the PTE, otherwise return 0.
*/
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
- unsigned int hugepage)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
{
pte_t old_pte, new_pte = __pte(0);
@@ -301,12 +299,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
cpu_relax();
continue;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* If hugepage and is trans splitting return None */
- if (unlikely(hugepage &&
- pmd_trans_splitting(pte_pmd(old_pte))))
- return __pte(0);
-#endif
/* If pte is not present return None */
if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
return __pte(0);
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9835ac4173b7..11a38635dd65 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -247,28 +247,16 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
#define pmd_large(pmd) 0
#define has_transparent_hugepage() 0
#endif
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
unsigned *shift);
-
-static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
- unsigned long *pte_sizep)
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+ unsigned *shift)
{
- pte_t *ptep;
- unsigned long ps = *pte_sizep;
- unsigned int shift;
-
- ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
- if (!ptep)
- return NULL;
- if (shift)
- *pte_sizep = 1ul << shift;
- else
- *pte_sizep = PAGE_SIZE;
-
- if (ps > *pte_sizep)
- return NULL;
-
- return ptep;
+ if (!arch_irqs_disabled()) {
+ pr_info("%s called with irq enabled\n", __func__);
+ dump_stack();
+ }
+ return __find_linux_pte_or_hugepte(pgdir, ea, shift);
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3b2252e7731b..8424b232e598 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -330,9 +330,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
int hugepage_shift;
/*
- * We won't find hugepages here, iomem
+ * We won't find hugepages here(this is iomem). Hence we are not
+ * worried about _PAGE_SPLITTING/collapse. Also we will not hit
+ * page table free, because of init_mm.
*/
- ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+ ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
if (!ptep)
return token;
WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 24b968f8e4d8..63d9cc4d7366 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -71,15 +71,15 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
vaddr = (unsigned long)PCI_FIX_ADDR(addr);
if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
return NULL;
-
- ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+ /*
+ * We won't find huge pages here (iomem). Also can't hit
+ * a page table free due to init_mm
+ */
+ ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
&hugepage_shift);
if (ptep == NULL)
paddr = 0;
else {
- /*
- * we don't have hugepages backing iomem
- */
WARN_ON(hugepage_shift);
paddr = pte_pfn(*ptep) << PAGE_SHIFT;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3c6c3d..0fe9c92e78ed 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -537,23 +537,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
- unsigned int hugepage_shift;
pte_t *ptep, pte;
-
+ unsigned long flags;
/*
* We need to protect against page table destruction
- * while looking up and updating the pte.
+ * hugepage split and collapse.
*/
- rcu_read_lock_sched();
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
- hva, &hugepage_shift);
+ hva, NULL);
if (ptep) {
- pte = kvmppc_read_update_linux_pte(ptep, 1,
- hugepage_shift);
+ pte = kvmppc_read_update_linux_pte(ptep, 1);
if (pte_write(pte))
write_ok = 1;
}
- rcu_read_unlock_sched();
+ local_irq_restore(flags);
}
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..00051fcee9d4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -24,13 +24,19 @@
/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
+ unsigned long flags = 0;
unsigned long addr = (unsigned long) x;
pte_t *p;
-
- p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
- if (!p || !pte_present(*p))
+ /*
+ * assume we don't have huge pages in vmalloc space...
+ * So don't worry about THP collapse/split. Called
+ * Only in realmode, hence won't need irq_save/restore.
+ */
+ p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+ if (!p || !pte_present(*p)) {
+ local_irq_restore(flags);
return NULL;
- /* assume we don't have huge pages in vmalloc space... */
+ }
addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
return __va(addr);
}
@@ -131,25 +137,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unlock_rmap(rmap);
}
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
- int writing, unsigned long *pte_sizep)
-{
- pte_t *ptep;
- unsigned long ps = *pte_sizep;
- unsigned int hugepage_shift;
-
- ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
- if (!ptep)
- return __pte(0);
- if (hugepage_shift)
- *pte_sizep = 1ul << hugepage_shift;
- else
- *pte_sizep = PAGE_SIZE;
- if (ps > *pte_sizep)
- return __pte(0);
- return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
{
asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
@@ -166,13 +153,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
struct revmap_entry *rev;
unsigned long g_ptel;
struct kvm_memory_slot *memslot;
- unsigned long pte_size;
+ unsigned hpage_shift;
unsigned long is_io;
unsigned long *rmap;
- pte_t pte;
+ pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
- unsigned long rcbits;
+ unsigned long rcbits, irq_flags = 0;
psize = hpte_page_size(pteh, ptel);
if (!psize)
@@ -208,22 +195,46 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
-
- /* Look up the Linux PTE for the backing page */
- pte_size = psize;
- pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
- if (pte_present(pte) && !pte_protnone(pte)) {
- if (writing && !pte_write(pte))
- /* make the actual HPTE be read-only */
- ptel = hpte_make_readonly(ptel);
- is_io = hpte_cache_bits(pte_val(pte));
- pa = pte_pfn(pte) << PAGE_SHIFT;
- pa |= hva & (pte_size - 1);
- pa |= gpa & ~PAGE_MASK;
+ /*
+ * If we had a page table table change after lookup, we would
+ * retry via mmu_notifier_retry.
+ */
+ if (realmode)
+ ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+ else {
+ local_irq_save(irq_flags);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
}
+ if (ptep) {
+ pte_t pte;
+ unsigned int host_pte_size;
- if (pte_size < psize)
- return H_PARAMETER;
+ if (hpage_shift)
+ host_pte_size = 1ul << hpage_shift;
+ else
+ host_pte_size = PAGE_SIZE;
+ /*
+ * We should always find the guest page size
+ * to <= host page size, if host is using hugepage
+ */
+ if (host_pte_size < psize) {
+ if (!realmode)
+ local_irq_restore(flags);
+ return H_PARAMETER;
+ }
+ pte = kvmppc_read_update_linux_pte(ptep, writing);
+ if (pte_present(pte) && !pte_protnone(pte)) {
+ if (writing && !pte_write(pte))
+ /* make the actual HPTE be read-only */
+ ptel = hpte_make_readonly(ptel);
+ is_io = hpte_cache_bits(pte_val(pte));
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+ pa |= hva & (host_pte_size - 1);
+ pa |= gpa & ~PAGE_MASK;
+ }
+ }
+ if (!realmode)
+ local_irq_restore(irq_flags);
ptel &= ~(HPTE_R_PP0 - psize);
ptel |= pa;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 5840d546aa03..4d33e199edcc 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -338,6 +338,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
pte_t *ptep;
unsigned int wimg = 0;
pgd_t *pgdir;
+ unsigned long flags;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
@@ -468,14 +469,23 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
pgdir = vcpu_e500->vcpu.arch.pgdir;
- ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
+ /*
+ * We are just looking at the wimg bits, so we don't
+ * care much about the trans splitting bit.
+ * We are holding kvm->mmu_lock so a notifier invalidate
+ * can't run hence pfn won't change.
+ */
+ local_irq_save(flags);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
if (ptep) {
pte_t pte = READ_ONCE(*ptep);
- if (pte_present(pte))
+ if (pte_present(pte)) {
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
MAS2_WIMGE_MASK;
- else {
+ local_irq_restore(flags);
+ } else {
+ local_irq_restore(flags);
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
__func__, (long)gfn, pfn);
ret = -EINVAL;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2c2022d16059..444f7a5b859b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1066,7 +1066,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
#endif /* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
- ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+ ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
rc = 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fa9d5c238d22..c0c3e5f14444 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -108,8 +108,12 @@ int pgd_huge(pgd_t pgd)
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
- /* Only called for hugetlbfs pages, hence can ignore THP */
- return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+ pte_t *ptep;
+ /*
+ * Only called for hugetlbfs pages, hence can ignore THP
+ */
+ ptep = __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+ return ptep;
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -681,28 +685,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
} while (addr = next, addr != end);
}
+/*
+ * We are holding mmap_sem, so a parallel huge page collapse cannot run.
+ * To prevent hugepage split, disable irq.
+ */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
pte_t *ptep;
struct page *page;
unsigned shift;
- unsigned long mask;
+ unsigned long mask, flags;
/*
* Transparent hugepages are handled by generic code. We can skip them
* here.
*/
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
/* Verify it is a huge page else bail. */
- if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
+ if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
+ local_irq_restore(flags);
return ERR_PTR(-EINVAL);
-
+ }
mask = (1UL << shift) - 1;
page = pte_page(*ptep);
if (page)
page += (address & mask) / PAGE_SIZE;
+ local_irq_restore(flags);
return page;
}
@@ -949,9 +960,12 @@ void flush_dcache_icache_hugepage(struct page *page)
*
* So long as we atomically load page table pointers we are safe against teardown,
* we can follow the address down to the the page and take a ref on it.
+ * Should be called with irq disabled. If is is splitting hugepage, we will return
+ * NULL;
*/
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+ unsigned *shift)
{
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
@@ -962,7 +976,6 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
if (shift)
*shift = 0;
-
pgdp = pgdir + pgd_index(ea);
pgd = READ_ONCE(*pgdp);
/*
@@ -1030,7 +1043,7 @@ out:
*shift = pdshift;
return ret_pte;
}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 2396dda282cd..a5162d789cfc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -111,41 +111,45 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
* interrupt context, so if the access faults, we read the page tables
* to find which page (if any) is mapped and access it directly.
*/
-static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
{
+ int ret = -EFAULT;
pgd_t *pgdir;
pte_t *ptep, pte;
unsigned shift;
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
- unsigned long pfn;
+ unsigned long pfn, flags;
void *kaddr;
pgdir = current->mm->pgd;
if (!pgdir)
return -EFAULT;
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+ if (!ptep)
+ goto err_out;
if (!shift)
shift = PAGE_SHIFT;
/* align address to page boundary */
offset = addr & ((1UL << shift) - 1);
- addr -= offset;
- if (ptep == NULL)
- return -EFAULT;
- pte = *ptep;
+ pte = READ_ONCE(*ptep);
if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
- return -EFAULT;
+ goto err_out;
pfn = pte_pfn(pte);
if (!page_is_ram(pfn))
- return -EFAULT;
+ goto err_out;
/* no highmem to worry about here */
kaddr = pfn_to_kaddr(pfn);
- memcpy(ret, kaddr + offset, nb);
- return 0;
+ memcpy(buf, kaddr + offset, nb);
+ ret = 0;
+err_out:
+ local_irq_restore(flags);
+ return ret;
}
static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
--
2.1.0
More information about the Linuxppc-dev
mailing list