[RFC PATCH] powerpc/mm/thp: Make page table walk safe against thp split/collapse
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Mar 24 10:36:02 AEDT 2015
On Mon, 2015-03-23 at 20:30 +0530, Aneesh Kumar K.V wrote:
> -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
> +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> unsigned long *pte_sizep)
> {
> pte_t *ptep;
> unsigned long ps = *pte_sizep;
> unsigned int shift;
> + unsigned long flags;
>
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> if (!ptep)
> - return NULL;
> + goto err_out;
> if (shift)
> *pte_sizep = 1ul << shift;
> else
> *pte_sizep = PAGE_SIZE;
>
> if (ps > *pte_sizep)
> - return NULL;
> + goto err_out;
>
> - return ptep;
> + local_irq_restore(flags);
> + return *ptep;
> +err_out:
> + local_irq_restore(flags);
> + return __pte(0);
> }
Doesn't the above go backwards vs. your explanation? I.e. you capture the
PTE *value* inside the irq-disabled section, then re-enable interrupts, with
no check for splitting or anything, so you can potentially return a stale PTE.
Do you assume the caller will check for splitting? It doesn't...
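To make it concrete: if you really want to return a snapshot value like this,
I'd expect the caller to revalidate it before relying on it, along the lines
of the usual mmu_notifier sequence-count dance (untested sketch, names only
illustrative of what the e500 path would need; this is not what the patch
currently does):

	unsigned long mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	pte = lookup_linux_pte(pgdir, hva, &tsize_pages);

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/* A split/collapse may have raced with us, "pte" is stale */
		spin_unlock(&kvm->mmu_lock);
		goto retry;
	}
	/* ... use "pte" to build the shadow TLB entry ... */
	spin_unlock(&kvm->mmu_lock);

Without something along those lines the snapshot can still be stale by the
time it is used.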
> #endif /* __ASSEMBLY__ */
>
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 3b2252e7731b..c2b44f250824 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -326,17 +326,21 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
> static inline unsigned long eeh_token_to_phys(unsigned long token)
> {
> pte_t *ptep;
> - unsigned long pa;
> + unsigned long pa, flags;
> int hugepage_shift;
>
> /*
> * We won't find hugepages here, iomem
> */
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
> - if (!ptep)
> + if (!ptep) {
> + local_irq_restore(flags);
> return token;
> + }
> WARN_ON(hugepage_shift);
> pa = pte_pfn(*ptep) << PAGE_SHIFT;
> + local_irq_restore(flags);
Isn't this the same as lookup_linux_pte()? Can't you just use that and
keep a WARN_ON(psize != PAGE_SIZE)?
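I.e. something like this (untested, written against the lookup_linux_pte()
prototype from your patch, where __pte(0) is the "not found" case):

	static inline unsigned long eeh_token_to_phys(unsigned long token)
	{
		unsigned long pa;
		unsigned long pte_size = PAGE_SIZE;
		pte_t pte;

		/* We won't find hugepages here, iomem */
		pte = lookup_linux_pte(init_mm.pgd, token, &pte_size);
		if (!pte_val(pte))
			return token;
		WARN_ON(pte_size != PAGE_SIZE);
		pa = pte_pfn(pte) << PAGE_SHIFT;
		return pa | (token & (PAGE_SIZE - 1));
	}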
> return pa | (token & (PAGE_SIZE-1));
> }
> diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
> index 24b968f8e4d8..bd958d93fd38 100644
> --- a/arch/powerpc/kernel/io-workarounds.c
> +++ b/arch/powerpc/kernel/io-workarounds.c
> @@ -65,6 +65,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
> if (token && token <= iowa_bus_count)
> bus = &iowa_busses[token - 1];
> else {
> + unsigned long flags;
> unsigned long vaddr, paddr;
> pte_t *ptep;
>
> @@ -72,6 +73,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
> if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
> return NULL;
>
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
> &hugepage_shift);
> if (ptep == NULL)
> @@ -83,6 +85,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
> WARN_ON(hugepage_shift);
> paddr = pte_pfn(*ptep) << PAGE_SHIFT;
> }
> + local_irq_restore(flags);
> bus = iowa_pci_find(vaddr, paddr);
Same.
> if (bus == NULL)
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 534acb3c6c3d..2dccaa6a1113 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -537,14 +537,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> }
> /* if the guest wants write access, see if that is OK */
> if (!writing && hpte_is_writable(r)) {
> + unsigned long flags;
> unsigned int hugepage_shift;
> pte_t *ptep, pte;
>
> /*
> * We need to protect against page table destruction
> - * while looking up and updating the pte.
> + * as well as hugepage split and collapse.
> */
> - rcu_read_lock_sched();
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(current->mm->pgd,
> hva, &hugepage_shift);
> if (ptep) {
> @@ -553,7 +554,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> if (pte_write(pte))
> write_ok = 1;
> }
> - rcu_read_unlock_sched();
> + local_irq_restore(flags);
> }
> }
Ack.
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index 625407e4d3b0..b0ad673b35cf 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -24,14 +24,19 @@
> /* Translate address of a vmalloc'd thing to a linear map address */
> static void *real_vmalloc_addr(void *x)
> {
> + unsigned long flags;
> unsigned long addr = (unsigned long) x;
> pte_t *p;
>
> + local_irq_save(flags);
> p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
> - if (!p || !pte_present(*p))
> + if (!p || !pte_present(*p)) {
> + local_irq_restore(flags);
> return NULL;
> + }
> /* assume we don't have huge pages in vmalloc space... */
> addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
> + local_irq_restore(flags);
> return __va(addr);
> }
This is called in real mode, and I don't like the debug stuff that can
happen inside local_irq_* in that context. Can you either punt that
to the higher-level callers (and so not do it in real mode at all) or
use a lower-level variant? (I prefer going for the callers.)
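For reference, the lower-level variant would look something like the below
(sketch, assuming the raw arch_local_irq_save()/arch_local_irq_restore()
helpers are fine to use here, since they skip the lockdep/irq-trace hooks),
though as said I'd rather see it pushed to the callers:

	static void *real_vmalloc_addr(void *x)
	{
		unsigned long addr = (unsigned long) x;
		unsigned long flags;
		pte_t *p;

		/* Raw variants: no tracing/lockdep, so usable in real mode */
		flags = arch_local_irq_save();
		p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
		if (!p || !pte_present(*p)) {
			arch_local_irq_restore(flags);
			return NULL;
		}
		/* assume we don't have huge pages in vmalloc space... */
		addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
		arch_local_irq_restore(flags);
		return __va(addr);
	}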
> @@ -134,20 +139,27 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
> static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
> int writing, unsigned long *pte_sizep)
> {
> - pte_t *ptep;
> + pte_t *ptep, pte;
> + unsigned long flags;
> unsigned long ps = *pte_sizep;
> unsigned int hugepage_shift;
>
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
> if (!ptep)
> - return __pte(0);
> + goto err_out;
> if (hugepage_shift)
> *pte_sizep = 1ul << hugepage_shift;
> else
> *pte_sizep = PAGE_SIZE;
> if (ps > *pte_sizep)
> - return __pte(0);
> - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
> + goto err_out;
> + pte = kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
> + local_irq_restore(flags);
> + return pte;
> +err_out:
> + local_irq_restore(flags);
> + return __pte(0);
> }
Ditto
> static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
> @@ -211,6 +223,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
>
> /* Look up the Linux PTE for the backing page */
> pte_size = psize;
> + /*
> + * If we had a page table change after lookup, we would
> + * retry via mmu_notifier_retry.
> + */
> pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
> if (pte_present(pte) && !pte_protnone(pte)) {
> if (writing && !pte_write(pte))
> diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> index cc536d4a75ef..b50778eee650 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -335,7 +335,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> unsigned long mmu_seq;
> struct kvm *kvm = vcpu_e500->vcpu.kvm;
> unsigned long tsize_pages = 0;
> - pte_t *ptep;
> + pte_t pte;
> unsigned int wimg = 0;
> pgd_t *pgdir;
>
> @@ -468,9 +468,16 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
>
>
> pgdir = vcpu_e500->vcpu.arch.pgdir;
> - ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
> - if (pte_present(*ptep))
> - wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
> + /*
> + * We are just looking at the wimg bits, so we don't
> + * care much about the THP splitting bit.
> + * What about hugepage collapse? The pfn will change;
> + * maybe we should hold mmap_sem? We also don't
> + * have a page reference count here.
> + */
> + pte = lookup_linux_pte(pgdir, hva, &tsize_pages);
> + if (pte_present(pte))
> + wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
> else {
> if (printk_ratelimit())
> pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 7e408bfc7948..92f01649450a 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -108,8 +108,17 @@ int pgd_huge(pgd_t pgd)
>
> pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> {
> - /* Only called for hugetlbfs pages, hence can ignore THP */
> - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
> + unsigned long flags;
> + pte_t *ptep;
> + /*
> + * Only called for hugetlbfs pages, hence can ignore THP
> + * The save and restore are there only for avoiding the warning
> + * in find_linux_pte_or_hugepte().
> + */
> + local_irq_save(flags);
> + ptep = find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
> + local_irq_restore(flags);
> + return ptep;
> }
>
> static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> @@ -681,28 +690,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> } while (addr = next, addr != end);
> }
>
> +/*
> + * We are holding mmap_sem, so a parallel huge page collapse cannot run.
> + * To prevent hugepage split, disable irq.
> + */
> struct page *
> follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
> {
> pte_t *ptep;
> struct page *page;
> unsigned shift;
> - unsigned long mask;
> + unsigned long mask, flags;
> /*
> * Transparent hugepages are handled by generic code. We can skip them
> * here.
> */
> + local_irq_save(flags);
> ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
>
> /* Verify it is a huge page else bail. */
> - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
> + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
> + local_irq_restore(flags);
> return ERR_PTR(-EINVAL);
> -
> + }
> mask = (1UL << shift) - 1;
> page = pte_page(*ptep);
> if (page)
> page += (address & mask) / PAGE_SIZE;
>
> + local_irq_restore(flags);
> return page;
> }
>
> @@ -949,6 +965,8 @@ void flush_dcache_icache_hugepage(struct page *page)
> *
> * So long as we atomically load page table pointers we are safe against teardown,
> * we can follow the address down to the the page and take a ref on it.
> + * Should be called with irq disabled. If it is a splitting hugepage, we will
> + * return NULL.
> */
>
> pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
> @@ -963,6 +981,11 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
> if (shift)
> *shift = 0;
>
> + if (!irqs_disabled()) {
Might want to use the arch_* variant for similar "beware of generic
debug stuff in real mode" reasons... see the sketch just below.
> + pr_info("%s called with irq enabled\n", __func__);
> + dump_stack();
> + }
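I.e. something like this (sketch, assuming arch_irqs_disabled() is available
on everything this builds for):

	/*
	 * Raw accessor: no irq tracing hooks, so the check remains safe
	 * when we get called in real mode.
	 */
	if (!arch_irqs_disabled()) {
		pr_info("%s called with irq enabled\n", __func__);
		dump_stack();
	}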
> +
> pgdp = pgdir + pgd_index(ea);
> pgd = ACCESS_ONCE(*pgdp);
> /*
> diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
> index 2396dda282cd..173e4abc0c56 100644
> --- a/arch/powerpc/perf/callchain.c
> +++ b/arch/powerpc/perf/callchain.c
> @@ -125,6 +125,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
> if (!pgdir)
> return -EFAULT;
>
> + /* FIXME!! I guess we come with irq disabled here ? */
Not that I know of; it's not guaranteed, so you'd need something like the
sketch below.
> ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
> if (!shift)
> shift = PAGE_SHIFT;
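Untested sketch of what read_user_stack_slow() itself would need, just to
illustrate the point:

	unsigned long flags;

	local_irq_save(flags);
	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
	if (!ptep) {
		local_irq_restore(flags);
		return -EFAULT;
	}
	if (!shift)
		shift = PAGE_SHIFT;
	/* ... copy the data out while the PTE can't go away under us ... */
	local_irq_restore(flags);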