[RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare
Michal Hocko
mhocko at kernel.org
Thu Jul 27 22:57:56 AEST 2017
On Thu 27-07-17 14:07:56, Aneesh Kumar K.V wrote:
> Instead of marking the pmd ready for split, invalidate the pmd. This should
> take care of powerpc requirement.
which is?
> Only side effect is that we mark the pmd
> invalid early. This can result in us blocking access to the page a bit longer
> if we race against a thp split.
Again, this doesn't tell me what is the problem and why do we care.
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 -
> arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 -
> arch/powerpc/include/asm/book3s/64/pgtable.h | 9 ----
> arch/powerpc/include/asm/book3s/64/radix.h | 6 ---
> arch/powerpc/mm/pgtable-hash64.c | 22 --------
> include/asm-generic/pgtable.h | 8 ---
> mm/huge_memory.c | 73 +++++++++++++--------------
> 7 files changed, 35 insertions(+), 87 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index 0c4e470571ca..7d914f4fc534 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -100,8 +100,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
> extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> pgtable_t pgtable);
> extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp);
> extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
> unsigned long addr, pmd_t *pmdp);
> extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index 8c8fb6fbdabe..b856e130c678 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -164,8 +164,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
> extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> pgtable_t pgtable);
> extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp);
> extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
> unsigned long addr, pmd_t *pmdp);
> extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index ece6912fae8e..557915792214 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1122,15 +1122,6 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
> extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> pmd_t *pmdp);
>
> -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
> -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp)
> -{
> - if (radix_enabled())
> - return radix__pmdp_huge_split_prepare(vma, address, pmdp);
> - return hash__pmdp_huge_split_prepare(vma, address, pmdp);
> -}
> -
> #define pmd_move_must_withdraw pmd_move_must_withdraw
> struct spinlock;
> static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 558fea3b2d22..a779a43b643b 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -270,12 +270,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
> return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
> return __pmd(pmd_val(pmd) | _PAGE_PTE);
> }
> -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp)
> -{
> - /* Nothing to do for radix. */
> - return;
> -}
>
> extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
> pmd_t *pmdp, unsigned long clr,
> diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
> index c0a7372bdaa6..00aee1485714 100644
> --- a/arch/powerpc/mm/pgtable-hash64.c
> +++ b/arch/powerpc/mm/pgtable-hash64.c
> @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
> return pgtable;
> }
>
> -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp)
> -{
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
> - VM_BUG_ON(pmd_devmap(*pmdp));
> -
> - /*
> - * We can't mark the pmd none here, because that will cause a race
> - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
> - * we spilt, but at the same time we wan't rest of the ppc64 code
> - * not to insert hash pte on this, because we will be modifying
> - * the deposited pgtable in the caller of this function. Hence
> - * clear the _PAGE_USER so that we move the fault handling to
> - * higher level function and that will serialize against ptl.
> - * We need to flush existing hash pte entries here even though,
> - * the translation is still valid, because we will withdraw
> - * pgtable_t after this.
> - */
> - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
> -}
> -
> /*
> * A linux hugepage PMD was changed and the corresponding hash table entries
> * neesd to be flushed.
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index ece5e399567a..b934e41277ac 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -313,14 +313,6 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> pmd_t *pmdp);
> #endif
>
> -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
> -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp)
> -{
> -
> -}
> -#endif
> -
> #ifndef __HAVE_ARCH_PTE_SAME
> static inline int pte_same(pte_t pte_a, pte_t pte_b)
> {
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e654592f04e7..0bd9850b1d81 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1923,8 +1923,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> struct mm_struct *mm = vma->vm_mm;
> struct page *page;
> pgtable_t pgtable;
> - pmd_t old, _pmd;
> - bool young, write, soft_dirty;
> + pmd_t old_pmd, _pmd;
> + bool young, write, dirty, soft_dirty;
> unsigned long addr;
> int i;
>
> @@ -1956,14 +1956,39 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> return __split_huge_zero_page_pmd(vma, haddr, pmd);
> }
>
> - page = pmd_page(*pmd);
> + /*
> + * Up to this point the pmd is present and huge and userland has the
> + * whole access to the hugepage during the split (which happens in
> + * place). If we overwrite the pmd with the not-huge version pointing
> + * to the pte here (which of course we could if all CPUs were bug
> + * free), userland could trigger a small page size TLB miss on the
> + * small sized TLB while the hugepage TLB entry is still established in
> + * the huge TLB. Some CPU doesn't like that.
> + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
> + * 383 on page 93. Intel should be safe but is also warns that it's
> + * only safe if the permission and cache attributes of the two entries
> + * loaded in the two TLB is identical (which should be the case here).
> + * But it is generally safer to never allow small and huge TLB entries
> + * for the same virtual address to be loaded simultaneously. So instead
> + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
> + * current pmd notpresent (atomically because here the pmd_trans_huge
> + * and pmd_trans_splitting must remain set at all times on the pmd
> + * until the split is complete for this pmd), then we flush the SMP TLB
> + * and finally we write the non-huge version of the pmd entry with
> + * pmd_populate.
> + */
> + old_pmd = pmdp_invalidate(vma, haddr, pmd);
> +
> + page = pmd_page(old_pmd);
> VM_BUG_ON_PAGE(!page_count(page), page);
> page_ref_add(page, HPAGE_PMD_NR - 1);
> - write = pmd_write(*pmd);
> - young = pmd_young(*pmd);
> - soft_dirty = pmd_soft_dirty(*pmd);
> -
> - pmdp_huge_split_prepare(vma, haddr, pmd);
> + write = pmd_write(old_pmd);
> + young = pmd_young(old_pmd);
> + dirty = pmd_dirty(*pmd);
> + soft_dirty = pmd_soft_dirty(old_pmd);
> + /*
> + * withdraw the table only after we mark the pmd entry invalid
> + */
> pgtable = pgtable_trans_huge_withdraw(mm, pmd);
> pmd_populate(mm, &_pmd, pgtable);
>
> @@ -1990,6 +2015,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> if (soft_dirty)
> entry = pte_mksoft_dirty(entry);
> }
> + if (dirty)
> + SetPageDirty(page + i);
> pte = pte_offset_map(&_pmd, addr);
> BUG_ON(!pte_none(*pte));
> set_pte_at(mm, addr, pte, entry);
> @@ -2017,36 +2044,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> }
>
> smp_wmb(); /* make pte visible before pmd */
> - /*
> - * Up to this point the pmd is present and huge and userland has the
> - * whole access to the hugepage during the split (which happens in
> - * place). If we overwrite the pmd with the not-huge version pointing
> - * to the pte here (which of course we could if all CPUs were bug
> - * free), userland could trigger a small page size TLB miss on the
> - * small sized TLB while the hugepage TLB entry is still established in
> - * the huge TLB. Some CPU doesn't like that.
> - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
> - * 383 on page 93. Intel should be safe but is also warns that it's
> - * only safe if the permission and cache attributes of the two entries
> - * loaded in the two TLB is identical (which should be the case here).
> - * But it is generally safer to never allow small and huge TLB entries
> - * for the same virtual address to be loaded simultaneously. So instead
> - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
> - * current pmd notpresent (atomically because here the pmd_trans_huge
> - * and pmd_trans_splitting must remain set at all times on the pmd
> - * until the split is complete for this pmd), then we flush the SMP TLB
> - * and finally we write the non-huge version of the pmd entry with
> - * pmd_populate.
> - */
> - old = pmdp_invalidate(vma, haddr, pmd);
> -
> - /*
> - * Transfer dirty bit using value returned by pmd_invalidate() to be
> - * sure we don't race with CPU that can set the bit under us.
> - */
> - if (pmd_dirty(old))
> - SetPageDirty(page);
> -
> pmd_populate(mm, pmd, pgtable);
>
> if (freeze) {
> --
> 2.13.3
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo at kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont at kvack.org"> email at kvack.org </a>
--
Michal Hocko
SUSE Labs
More information about the Linuxppc-dev
mailing list