[PATCH -V5 06/25] powerpc: Reduce PTE table memory wastage
David Gibson
dwg at au1.ibm.com
Wed Apr 10 14:46:11 EST 2013
On Thu, Apr 04, 2013 at 11:27:44AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar at linux.vnet.ibm.com>
>
> We allocate one page for the last level of linux page table. With THP and
> large page size of 16MB, that would mean we are wasting large part
> of that page. To map 16MB area, we only need a PTE space of 2K with 64K
> page size. This patch reduce the space wastage by sharing the page
> allocated for the last level of linux page table with multiple pmd
> entries. We call these smaller chunks PTE page fragments and allocated
> page, PTE page.
>
> In order to support systems which doesn't have 64K HPTE support, we also
> add another 2K to PTE page fragment. The second half of the PTE fragments
> is used for storing slot and secondary bit information of an HPTE. With this
> we now have a 4K PTE fragment.
>
> We use a simple approach to share the PTE page. On allocation, we bump the
> PTE page refcount to 16 and share the PTE page with the next 16 pte alloc
> request. This should help in the node locality of the PTE page fragment,
> assuming that the immediate pte alloc request will mostly come from the
> same NUMA node. We don't try to reuse the freed PTE page fragment. Hence
> we could be waisting some space.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/mmu-book3e.h | 4 +
> arch/powerpc/include/asm/mmu-hash64.h | 4 +
> arch/powerpc/include/asm/page.h | 4 +
> arch/powerpc/include/asm/pgalloc-64.h | 72 ++++-------------
> arch/powerpc/kernel/setup_64.c | 4 +-
> arch/powerpc/mm/mmu_context_hash64.c | 35 +++++++++
> arch/powerpc/mm/pgtable_64.c | 137 +++++++++++++++++++++++++++++++++
> 7 files changed, 202 insertions(+), 58 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
> index 99d43e0..affbd68 100644
> --- a/arch/powerpc/include/asm/mmu-book3e.h
> +++ b/arch/powerpc/include/asm/mmu-book3e.h
> @@ -231,6 +231,10 @@ typedef struct {
> u64 high_slices_psize; /* 4 bits per slice for now */
> u16 user_psize; /* page size index */
> #endif
> +#ifdef CONFIG_PPC_64K_PAGES
> + /* for 4K PTE fragment support */
> + struct page *pgtable_page;
> +#endif
> } mm_context_t;
>
> /* Page size definitions, common between 32 and 64-bit
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 35bb51e..300ac3c 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -498,6 +498,10 @@ typedef struct {
> unsigned long acop; /* mask of enabled coprocessor types */
> unsigned int cop_pid; /* pid value used with coprocessors */
> #endif /* CONFIG_PPC_ICSWX */
> +#ifdef CONFIG_PPC_64K_PAGES
> + /* for 4K PTE fragment support */
> + struct page *pgtable_page;
> +#endif
> } mm_context_t;
>
>
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index f072e97..38e7ff6 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -378,7 +378,11 @@ void arch_free_page(struct page *page, int order);
>
> struct vm_area_struct;
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +typedef pte_t *pgtable_t;
> +#else
> typedef struct page *pgtable_t;
> +#endif
Ugh, that's pretty horrible, though I don't see an easy way around it.
> #include <asm-generic/memory_model.h>
> #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
> index cdbf555..3418989 100644
> --- a/arch/powerpc/include/asm/pgalloc-64.h
> +++ b/arch/powerpc/include/asm/pgalloc-64.h
> @@ -150,6 +150,13 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
>
> #else /* if CONFIG_PPC_64K_PAGES */
>
> +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
> +extern void page_table_free(struct mm_struct *, unsigned long *, int);
> +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
> +#ifdef CONFIG_SMP
> +extern void __tlb_remove_table(void *_table);
> +#endif
> +
> #define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd)
>
> static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
> @@ -161,90 +168,42 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
> static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
> pgtable_t pte_page)
> {
> - pmd_populate_kernel(mm, pmd, page_address(pte_page));
> + pmd_set(pmd, (unsigned long)pte_page);
> }
>
> static inline pgtable_t pmd_pgtable(pmd_t pmd)
> {
> - return pmd_page(pmd);
> + return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
> }
>
> static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
> unsigned long address)
> {
> - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
> + return (pte_t *)page_table_alloc(mm, address, 1);
> }
>
> static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
> - unsigned long address)
> + unsigned long address)
> {
> - struct page *page;
> - pte_t *pte;
> -
> - pte = pte_alloc_one_kernel(mm, address);
> - if (!pte)
> - return NULL;
> - page = virt_to_page(pte);
> - pgtable_page_ctor(page);
> - return page;
> + return (pgtable_t)page_table_alloc(mm, address, 0);
> }
>
> static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> {
> - free_page((unsigned long)pte);
> + page_table_free(mm, (unsigned long *)pte, 1);
> }
>
> static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
> {
> - pgtable_page_dtor(ptepage);
> - __free_page(ptepage);
> -}
> -
> -static inline void pgtable_free(void *table, unsigned index_size)
> -{
> - if (!index_size)
> - free_page((unsigned long)table);
> - else {
> - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
> - kmem_cache_free(PGT_CACHE(index_size), table);
> - }
> + page_table_free(mm, (unsigned long *)ptepage, 0);
> }
>
> -#ifdef CONFIG_SMP
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> - void *table, int shift)
> -{
> - unsigned long pgf = (unsigned long)table;
> - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> - pgf |= shift;
> - tlb_remove_table(tlb, (void *)pgf);
> -}
> -
> -static inline void __tlb_remove_table(void *_table)
> -{
> - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> -
> - pgtable_free(table, shift);
> -}
> -#else /* !CONFIG_SMP */
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> - void *table, int shift)
> -{
> - pgtable_free(table, shift);
> -}
> -#endif /* CONFIG_SMP */
> -
> static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
> unsigned long address)
> {
> - struct page *page = page_address(table);
> -
> tlb_flush_pgtable(tlb, address);
> - pgtable_page_dtor(page);
> - pgtable_free_tlb(tlb, page, 0);
> + pgtable_free_tlb(tlb, table, 0);
> }
> -
> #endif /* CONFIG_PPC_64K_PAGES */
>
> static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> @@ -258,7 +217,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
> kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
> }
>
> -
> #define __pmd_free_tlb(tlb, pmd, addr) \
> pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
> #ifndef CONFIG_PPC_64K_PAGES
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 6da881b..04d833c 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -575,7 +575,9 @@ void __init setup_arch(char **cmdline_p)
> init_mm.end_code = (unsigned long) _etext;
> init_mm.end_data = (unsigned long) _edata;
> init_mm.brk = klimit;
> -
> +#ifdef CONFIG_PPC_64K_PAGES
> + init_mm.context.pgtable_page = NULL;
> +#endif
> irqstack_early_init();
> exc_lvl_early_init();
> emergency_stack_init();
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> index 59cd773..fbfdca2 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -86,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> spin_lock_init(mm->context.cop_lockp);
> #endif /* CONFIG_PPC_ICSWX */
>
> +#ifdef CONFIG_PPC_64K_PAGES
> + mm->context.pgtable_page = NULL;
> +#endif
> return 0;
> }
>
> @@ -97,13 +100,45 @@ void __destroy_context(int context_id)
> }
> EXPORT_SYMBOL_GPL(__destroy_context);
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +static void destroy_pagetable_page(struct mm_struct *mm)
> +{
> + int count;
> + struct page *page;
> +
> + page = mm->context.pgtable_page;
> + if (!page)
> + return;
> +
> + /* drop all the pending references */
> + count = atomic_read(&page->_mapcount) + 1;
> + /* We allow PTE_FRAG_NR(16) fragments from a PTE page */
> + count = atomic_sub_return(16 - count, &page->_count);
You should really move PTE_FRAG_NR to a header so you can actually use
it here rather than hard coding 16.
It took me a fair while to convince myself that there is no race here
with something altering mapcount and count between the atomic_read()
and the atomic_sub_return(). It could do with a comment to explain
why that is safe.
Re-using the mapcount field for your index also seems odd, and it took
me a while to convince myself that that's safe too. Wouldn't it be
simpler to store a pointer to the next sub-page in the mm_context
instead? You can get from that to the struct page easily enough with a
shift and pfn_to_page().
> + if (!count) {
> + pgtable_page_dtor(page);
> + reset_page_mapcount(page);
> + free_hot_cold_page(page, 0);
It would be nice to use put_page() somehow instead of duplicating its
logic, though I realise the sparc code you've based this on does the
same thing.
> + }
> +}
> +
> +#else
> +static inline void destroy_pagetable_page(struct mm_struct *mm)
> +{
> + return;
> +}
> +#endif
> +
> +
> void destroy_context(struct mm_struct *mm)
> {
> +
> #ifdef CONFIG_PPC_ICSWX
> drop_cop(mm->context.acop, mm);
> kfree(mm->context.cop_lockp);
> mm->context.cop_lockp = NULL;
> #endif /* CONFIG_PPC_ICSWX */
> +
> + destroy_pagetable_page(mm);
> __destroy_context(mm->context.id);
> subpage_prot_free(mm);
> mm->context.id = MMU_NO_CONTEXT;
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index e212a27..e79840b 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -337,3 +337,140 @@ EXPORT_SYMBOL(__ioremap_at);
> EXPORT_SYMBOL(iounmap);
> EXPORT_SYMBOL(__iounmap);
> EXPORT_SYMBOL(__iounmap_at);
> +
> +#ifdef CONFIG_PPC_64K_PAGES
> +/*
> + * we support 16 fragments per PTE page. This is limited by how many
> + * bits we can pack in page->_mapcount. We use the first half for
> + * tracking the usage for rcu page table free.
> + */
> +#define PTE_FRAG_NR 16
> +/*
> + * We use a 2K PTE page fragment and another 2K for storing
> + * real_pte_t hash index
> + */
> +#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
> +
> +static pte_t *get_from_cache(struct mm_struct *mm)
> +{
> + int index;
> + pte_t *ret = NULL;
> + struct page *page;
> +
> + spin_lock(&mm->page_table_lock);
> + page = mm->context.pgtable_page;
> + if (page) {
> + void *p = page_address(page);
> + index = atomic_add_return(1, &page->_mapcount);
> + ret = (pte_t *) (p + (index * PTE_FRAG_SIZE));
> + /*
> + * If we have taken up all the fragments mark PTE page NULL
> + */
> + if (index == PTE_FRAG_NR - 1)
> + mm->context.pgtable_page = NULL;
> + }
> + spin_unlock(&mm->page_table_lock);
> + return ret;
> +}
> +
> +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
> +{
> + pte_t *ret = NULL;
> + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
> + __GFP_REPEAT | __GFP_ZERO);
> + if (!page)
> + return NULL;
> +
> + spin_lock(&mm->page_table_lock);
> + /*
> + * If we find pgtable_page set, we return
> + * the allocated page with single fragement
> + * count.
> + */
> + if (likely(!mm->context.pgtable_page)) {
> + atomic_set(&page->_count, PTE_FRAG_NR);
> + atomic_set(&page->_mapcount, 0);
> + mm->context.pgtable_page = page;
> + }
.. and in the unlikely case where there *is* a pgtable_page already
set, what then? Seems like you should BUG_ON, or at least return NULL
- as it is you will return the first sub-page of that page again,
which is very likely in use.
> + spin_unlock(&mm->page_table_lock);
> +
> + ret = (unsigned long *)page_address(page);
> + if (!kernel)
> + pgtable_page_ctor(page);
> +
> + return ret;
> +}
> +
> +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
> +{
> + pte_t *pte;
> +
> + pte = get_from_cache(mm);
> + if (pte)
> + return pte;
> +
> + return __alloc_for_cache(mm, kernel);
> +}
> +
> +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
> +{
> + struct page *page = virt_to_page(table);
> + if (put_page_testzero(page)) {
> + if (!kernel)
> + pgtable_page_dtor(page);
> + reset_page_mapcount(page);
> + free_hot_cold_page(page, 0);
> + }
> +}
> +
> +#ifdef CONFIG_SMP
> +static void page_table_free_rcu(void *table)
> +{
> + struct page *page = virt_to_page(table);
> + if (put_page_testzero(page)) {
> + pgtable_page_dtor(page);
> + reset_page_mapcount(page);
> + free_hot_cold_page(page, 0);
> + }
> +}
> +
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> + unsigned long pgf = (unsigned long)table;
> +
> + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> + pgf |= shift;
> + tlb_remove_table(tlb, (void *)pgf);
> +}
> +
> +void __tlb_remove_table(void *_table)
> +{
> + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> +
> + if (!shift)
> + /* PTE page needs special handling */
> + page_table_free_rcu(table);
> + else {
> + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> + kmem_cache_free(PGT_CACHE(shift), table);
> + }
> +}
> +#else
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> + if (!shift) {
> + /* PTE page needs special handling */
> + struct page *page = virt_to_page(table);
> + if (put_page_testzero(page)) {
> + pgtable_page_dtor(page);
> + reset_page_mapcount(page);
> + free_hot_cold_page(page, 0);
> + }
> + } else {
> + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> + kmem_cache_free(PGT_CACHE(shift), table);
> + }
> +}
> +#endif
> +#endif /* CONFIG_PPC_64K_PAGES */
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20130410/c1fc146e/attachment-0001.sig>
More information about the Linuxppc-dev
mailing list