[PATCH v1 2/3] powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings
Aneesh Kumar K.V
aneesh.kumar at linux.ibm.com
Tue Jun 23 20:37:34 AEST 2020
Bharata B Rao <bharata at linux.ibm.com> writes:
> We can hit the following BUG_ON during memory unplug:
>
> kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:342!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> NIP [c000000000093308] pmd_fragment_free+0x48/0xc0
> LR [c00000000147bfec] remove_pagetable+0x578/0x60c
> Call Trace:
> 0xc000008050000000 (unreliable)
> remove_pagetable+0x384/0x60c
> radix__remove_section_mapping+0x18/0x2c
> remove_section_mapping+0x1c/0x3c
> arch_remove_memory+0x11c/0x180
> try_remove_memory+0x120/0x1b0
> __remove_memory+0x20/0x40
> dlpar_remove_lmb+0xc0/0x114
> dlpar_memory+0x8b0/0xb20
> handle_dlpar_errorlog+0xc0/0x190
> pseries_hp_work_fn+0x2c/0x60
> process_one_work+0x30c/0x810
> worker_thread+0x98/0x540
> kthread+0x1c4/0x1d0
> ret_from_kernel_thread+0x5c/0x74
>
> This occurs when unplug is attempted for such memory which has
> been mapped using memblock pages as part of early kernel page
> table setup. We wouldn't have initialized the PMD or PTE fragment
> count for those PMD or PTE pages.
>
> Fixing this includes 3 parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
> the PMD and PTE fragment count to 1.
> - When freeing PUD, PMD and PTE page table pages, check explicitly
> if they come from memblock and if so free then appropriately.
> - When we do early memblock based allocation of PMD and PUD pages,
> allocate in PAGE_SIZE granularity so that we are sure the
> complete page is used as pagetable page.
>
> Since we now do PAGE_SIZE allocations for both PUD table and
> PMD table (Note that PTE table allocation is already of PAGE_SIZE),
> we end up allocating more memory for the same amount of system RAM.
> Here is a comparision of how much more we need for a 64T and 2G
> system after this patch:
>
> 1. 64T system
> -------------
> 64T RAM would need 64G for vmemmap with struct page size being 64B.
>
> 128 PUD tables for 64T memory (1G mappings)
> 1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
> With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K
>
> 2. 2G system
> ------------
> 2G RAM would need 2M for vmemmap with struct page size being 64B.
>
> 1 PUD table for 2G memory (1G mapping)
> 1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
> With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K
How about we just do
void pmd_fragment_free(unsigned long *pmd)
{
struct page *page = virt_to_page(pmd);
/*
* Early pmd pages allocated via memblock
* allocator need to be freed differently
*/
if (PageReserved(page))
return free_reserved_page(page);
BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
if (atomic_dec_and_test(&page->pt_frag_refcount)) {
pgtable_pmd_page_dtor(page);
__free_page(page);
}
}
That way we could avoid the fixup_pgtable_fragments completely?
>
> Signed-off-by: Bharata B Rao <bharata at linux.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/pgalloc.h | 11 ++-
> arch/powerpc/include/asm/book3s/64/radix.h | 1 +
> arch/powerpc/include/asm/sparsemem.h | 1 +
> arch/powerpc/mm/book3s64/pgtable.c | 31 +++++++-
> arch/powerpc/mm/book3s64/radix_pgtable.c | 80 +++++++++++++++++++-
> arch/powerpc/mm/mem.c | 5 ++
> arch/powerpc/mm/pgtable-frag.c | 9 ++-
> 7 files changed, 129 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index 69c5b051734f..56d695f0095c 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -109,7 +109,16 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>
> static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> {
> - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> + struct page *page = virt_to_page(pud);
> +
> + /*
> + * Early pud pages allocated via memblock allocator
> + * can't be directly freed to slab
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> }
>
> static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 0cba794c4fb8..90f05d52f46d 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -297,6 +297,7 @@ static inline unsigned long radix__get_tree_size(void)
> int radix__create_section_mapping(unsigned long start, unsigned long end,
> int nid, pgprot_t prot);
> int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
> #endif /* CONFIG_MEMORY_HOTPLUG */
> #endif /* __ASSEMBLY__ */
> #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index c89b32443cff..d0b22a937a7a 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -16,6 +16,7 @@
> extern int create_section_mapping(unsigned long start, unsigned long end,
> int nid, pgprot_t prot);
> extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
> #ifdef CONFIG_PPC_BOOK3S_64
> extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index c58ad1049909..ee94a28dc6f9 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -184,6 +184,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
> return hash__remove_section_mapping(start, end);
> }
> +
> +void fixup_pgtable_fragments(void)
> +{
> + if (radix_enabled())
> + radix__fixup_pgtable_fragments();
> +}
> +
> #endif /* CONFIG_MEMORY_HOTPLUG */
>
> void __init mmu_partition_table_init(void)
> @@ -341,13 +348,23 @@ void pmd_fragment_free(unsigned long *pmd)
>
> BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
> if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> - pgtable_pmd_page_dtor(page);
> - __free_page(page);
> + /*
> + * Early pmd pages allocated via memblock
> + * allocator wouldn't have called _ctor
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else {
> + pgtable_pmd_page_dtor(page);
> + __free_page(page);
> + }
> }
> }
>
> static inline void pgtable_free(void *table, int index)
> {
> + struct page *page;
> +
> switch (index) {
> case PTE_INDEX:
> pte_fragment_free(table, 0);
> @@ -356,7 +373,15 @@ static inline void pgtable_free(void *table, int index)
> pmd_fragment_free(table);
> break;
> case PUD_INDEX:
> - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> + page = virt_to_page(table);
> + /*
> + * Early pud pages allocated via memblock
> + * allocator need to be freed differently
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> break;
> #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
> /* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index ffccfe00ca2a..58e42393d5e8 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -37,6 +37,71 @@
> unsigned int mmu_pid_bits;
> unsigned int mmu_base_pid;
>
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> + pte_t *pte;
> + struct page *page;
> +
> + if (pmd_none(*pmd))
> + continue;
> + if (pmd_is_leaf(*pmd))
> + continue;
> +
> + pte = pte_offset_kernel(pmd, 0);
> + page = virt_to_page(pte);
> + atomic_inc(&page->pt_frag_refcount);
> + }
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> + pmd_t *pmd;
> + struct page *page;
> +
> + if (pud_none(*pud))
> + continue;
> + if (pud_is_leaf(*pud))
> + continue;
> +
> + pmd = pmd_offset(pud, 0);
> + page = virt_to_page(pmd);
> + atomic_inc(&page->pt_frag_refcount);
> + fixup_pte_fragments(pmd);
> + }
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> + int i;
> + pgd_t *pgd = pgd_offset_k(0UL);
> +
> + spin_lock(&init_mm.page_table_lock);
> + for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> + p4d_t *p4d = p4d_offset(pgd, 0UL);
> + pud_t *pud;
> +
> + if (p4d_none(*p4d))
> + continue;
> + if (p4d_is_leaf(*p4d))
> + continue;
> +
> + pud = pud_offset(p4d, 0);
> + fixup_pmd_fragments(pud);
> + }
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +
> static __ref void *early_alloc_pgtable(unsigned long size, int nid,
> unsigned long region_start, unsigned long region_end)
> {
> @@ -58,6 +123,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid,
> return ptr;
> }
>
> +/*
> + * When allocating pud or pmd pointers, we allocate a complete page
> + * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
> + * is to ensure that the page obtained from the memblock allocator
> + * can be completely used as page table page and can be freed
> + * correctly when the page table entries are removed.
> + */
> static int early_map_kernel_page(unsigned long ea, unsigned long pa,
> pgprot_t flags,
> unsigned int map_page_size,
> @@ -74,8 +146,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
> pgdp = pgd_offset_k(ea);
> p4dp = p4d_offset(pgdp, ea);
> if (p4d_none(*p4dp)) {
> - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> - region_start, region_end);
> + pudp = early_alloc_pgtable(PAGE_SIZE, nid,
> + region_start, region_end);
> p4d_populate(&init_mm, p4dp, pudp);
> }
> pudp = pud_offset(p4dp, ea);
> @@ -84,8 +156,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
> goto set_the_pte;
> }
> if (pud_none(*pudp)) {
> - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> - region_start, region_end);
> + pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> + region_end);
> pud_populate(&init_mm, pudp, pmdp);
> }
> pmdp = pmd_offset(pudp, ea);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 5f7fe13211e9..b8ea004c3ebf 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -54,6 +54,10 @@
>
> #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
> #ifndef CPU_FTR_COHERENT_ICACHE
> #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
> #define CPU_FTR_NOEXECUTE 0
> @@ -301,6 +305,7 @@ void __init mem_init(void)
>
> memblock_free_all();
>
> + fixup_pgtable_fragments();
> #ifdef CONFIG_HIGHMEM
> {
> unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index ee4bd6d38602..16213c09896a 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
> if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> if (!kernel)
> pgtable_pte_page_dtor(page);
> - __free_page(page);
> + /*
> + * Early pte pages allocated via memblock
> + * allocator need to be freed differently
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + __free_page(page);
> }
> }
> --
> 2.21.3
More information about the Linuxppc-dev
mailing list