[RFC PATCH v0 3/5] powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings
Aneesh Kumar K.V
aneesh.kumar at linux.ibm.com
Mon Jun 22 22:53:39 AEST 2020
Bharata B Rao <bharata at linux.ibm.com> writes:
> We can hit the following BUG_ON during memory unplug
>
> kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:344!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> NIP [c000000000097d48] pmd_fragment_free+0x48/0xd0
> LR [c0000000016aaefc] remove_pagetable+0x494/0x530
> Call Trace:
> _raw_spin_lock+0x54/0x80 (unreliable)
> remove_pagetable+0x2b0/0x530
> radix__remove_section_mapping+0x18/0x2c
> remove_section_mapping+0x38/0x5c
> arch_remove_memory+0x124/0x190
> try_remove_memory+0xd0/0x1c0
> __remove_memory+0x20/0x40
> dlpar_remove_lmb+0xbc/0x110
> dlpar_memory+0xa90/0xd40
> handle_dlpar_errorlog+0xa8/0x160
> pseries_hp_work_fn+0x2c/0x60
> process_one_work+0x47c/0x870
> worker_thread+0x364/0x5e0
> kthread+0x1b4/0x1c0
> ret_from_kernel_thread+0x5c/0x74
>
> This occurs when unplug is attempted for memory that was
> mapped using memblock pages as part of early kernel page
> table setup. The PMD or PTE fragment count is never initialized
> for those PMD or PTE pages.
>
> Fixing this includes 3 parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
> the PMD and PTE fragment count to 1.
> - When freeing PUD, PMD and PTE page table pages, check explicitly
> if they come from memblock and if so free them appropriately.
> - When we do early memblock based allocation of PMD and PUD pages,
> allocate in PAGE_SIZE granularity so that we are sure the
> complete page is used as pagetable page.
>
> Since we now do PAGE_SIZE allocations for both PUD table and
> PMD table (Note that PTE table allocation is already of PAGE_SIZE),
> we end up allocating more memory for the same amount of system RAM.
> Here is a comparison of how much more we need for a 64T and 2G
> system after this patch:
>
> 1. 64T system
> -------------
> 64T RAM would need 64G for vmemmap with struct page size being 64B.
>
> 128 PUD tables for 64T memory (1G mappings)
> 1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
> With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K
>
> 2. 2G system
> ------------
> 2G RAM would need 2M for vmemmap with struct page size being 64B.
>
> 1 PUD table for 2G memory (1G mapping)
> 1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
> With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K
>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar at linux.ibm.com>
> Signed-off-by: Bharata B Rao <bharata at linux.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/pgalloc.h | 11 ++-
> arch/powerpc/include/asm/book3s/64/radix.h | 1 +
> arch/powerpc/include/asm/sparsemem.h | 1 +
> arch/powerpc/mm/book3s64/pgtable.c | 31 ++++++++-
> arch/powerpc/mm/book3s64/radix_pgtable.c | 72 ++++++++++++++++++--
> arch/powerpc/mm/mem.c | 5 ++
> arch/powerpc/mm/pgtable-frag.c | 9 ++-
> 7 files changed, 121 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index a41e91bd0580..e96572fb2871 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -109,7 +109,16 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>
> static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> {
> - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> + struct page *page = virt_to_page(pud);
> +
> + /*
> + * Early pud pages allocated via memblock allocator
> + * can't be directly freed to slab
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> }
>
> static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index d97db3ad9aae..0aff8750181a 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -291,6 +291,7 @@ static inline unsigned long radix__get_tree_size(void)
> #ifdef CONFIG_MEMORY_HOTPLUG
> int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
> int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
> #endif /* CONFIG_MEMORY_HOTPLUG */
> #endif /* __ASSEMBLY__ */
> #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
> #ifdef CONFIG_MEMORY_HOTPLUG
> extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
> extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
> #ifdef CONFIG_PPC_BOOK3S_64
> extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 2bf7e1b4fd82..be7aa8786747 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
> return hash__remove_section_mapping(start, end);
> }
> +
> +void fixup_pgtable_fragments(void)
> +{
> + if (radix_enabled())
> + radix__fixup_pgtable_fragments();
> +}
> +
> #endif /* CONFIG_MEMORY_HOTPLUG */
>
> void __init mmu_partition_table_init(void)
> @@ -343,13 +350,23 @@ void pmd_fragment_free(unsigned long *pmd)
>
> BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
> if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> - pgtable_pmd_page_dtor(page);
> - __free_page(page);
> + /*
> + * Early pmd pages allocated via memblock
> + * allocator wouldn't have called _ctor
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else {
> + pgtable_pmd_page_dtor(page);
> + __free_page(page);
> + }
> }
> }
>
> static inline void pgtable_free(void *table, int index)
> {
> + struct page *page;
> +
> switch (index) {
> case PTE_INDEX:
> pte_fragment_free(table, 0);
> @@ -358,7 +375,15 @@ static inline void pgtable_free(void *table, int index)
> pmd_fragment_free(table);
> break;
> case PUD_INDEX:
> - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> + page = virt_to_page(table);
> + /*
> + * Early pud pages allocated via memblock
> + * allocator need to be freed differently
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> break;
> #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
> /* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 4a4fb30f6c3d..e675c0bbf9a4 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -36,6 +36,70 @@
> unsigned int mmu_pid_bits;
> unsigned int mmu_base_pid;
>
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> + pte_t *pte;
> + struct page *page;
> +
> + if (pmd_none(*pmd))
> + continue;
> + if (pmd_is_leaf(*pmd))
> + continue;
> +
> + pte = pte_offset_kernel(pmd, 0);
> + page = virt_to_page(pte);
> + atomic_inc(&page->pt_frag_refcount);
> + }
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> + pmd_t *pmd;
> + struct page *page;
> +
> + if (pud_none(*pud))
> + continue;
> + if (pud_is_leaf(*pud))
> + continue;
> +
> + pmd = pmd_offset(pud, 0);
> + page = virt_to_page(pmd);
> + atomic_inc(&page->pt_frag_refcount);
> + fixup_pte_fragments(pmd);
> + }
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> + int i;
> + pgd_t *pgd = pgd_offset_k(0UL);
> +
> + spin_lock(&init_mm.page_table_lock);
> + for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> + pud_t *pud;
> +
> + if (pgd_none(*pgd))
> + continue;
> + if (pgd_is_leaf(*pgd))
> + continue;
> +
> + pud = pud_offset(pgd, 0);
> + fixup_pmd_fragments(pud);
> + }
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +
> static __ref void *early_alloc_pgtable(unsigned long size, int nid,
> unsigned long region_start, unsigned long region_end)
> {
> @@ -71,8 +135,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>
> pgdp = pgd_offset_k(ea);
> if (pgd_none(*pgdp)) {
> - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> - region_start, region_end);
> + pudp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> + region_end);
> pgd_populate(&init_mm, pgdp, pudp);
> }
> pudp = pud_offset(pgdp, ea);
> @@ -81,8 +145,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
> goto set_the_pte;
> }
> if (pud_none(*pudp)) {
> - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> - region_start, region_end);
> + pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> + region_end);
> pud_populate(&init_mm, pudp, pmdp);
> }
> pmdp = pmd_offset(pudp, ea);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 1c07d5a3f543..d43ad701f693 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -53,6 +53,10 @@
>
> #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
> #ifndef CPU_FTR_COHERENT_ICACHE
> #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
> #define CPU_FTR_NOEXECUTE 0
> @@ -307,6 +311,7 @@ void __init mem_init(void)
>
> memblock_free_all();
>
> + fixup_pgtable_fragments();
> #ifdef CONFIG_HIGHMEM
> {
> unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index ee4bd6d38602..16213c09896a 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
> if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> if (!kernel)
> pgtable_pte_page_dtor(page);
> - __free_page(page);
> + /*
> + * Early pte pages allocated via memblock
> + * allocator need to be freed differently
> + */
> + if (PageReserved(page))
> + free_reserved_page(page);
> + else
> + __free_page(page);
> }
> }
> --
> 2.21.0
More information about the Linuxppc-dev
mailing list