[PATCH v5 3/4] powerpc/mm: add radix__remove_section_mapping()
Balbir Singh
bsingharora at gmail.com
Tue Jan 17 18:22:51 AEDT 2017
On Mon, Jan 16, 2017 at 01:07:45PM -0600, Reza Arbab wrote:
> Tear down and free the four-level page tables of physical mappings
> during memory hotremove.
>
> Borrow the basic structure of remove_pagetable() and friends from the
> identically-named x86 functions. Reduce the frequency of tlb flushes and
> page_table_lock spinlocks by only doing them in the outermost function.
> There was some question as to whether the locking is needed at all.
> Leave it for now, but we could consider dropping it.
>
> Memory must be offline to be removed, thus not in use. So there
> shouldn't be the sort of concurrent page walking activity here that
> might prompt us to use RCU.
>
> Signed-off-by: Reza Arbab <arbab at linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/radix.h | 1 +
> arch/powerpc/mm/pgtable-book3s64.c | 2 +-
> arch/powerpc/mm/pgtable-radix.c | 133 +++++++++++++++++++++++++++++
> 3 files changed, 135 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 43c2571..0032b66 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> int radix__create_section_mapping(unsigned long start, unsigned long end);
> +int radix__remove_section_mapping(unsigned long start, unsigned long end);
> #endif /* CONFIG_MEMORY_HOTPLUG */
> #endif /* __ASSEMBLY__ */
> #endif
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
> index 2b13f6b..b798ff6 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end)
> int remove_section_mapping(unsigned long start, unsigned long end)
> {
> if (radix_enabled())
> - return -ENODEV;
> + return radix__remove_section_mapping(start, end);
>
> return hash__remove_section_mapping(start, end);
> }
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
> index 075b4ec..cfba666 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -476,10 +476,143 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
> }
>
Shouldn't most of these functions have __meminit?
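For example (just a sketch of the annotation I have in mind, shown on the
first helper; the other new functions below would get the same treatment):

	static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)

and likewise remove_pte_table()/remove_pmd_table()/remove_pud_table(),
remove_pagetable() and radix__remove_section_mapping().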
> #ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> + pte_t *pte;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + pte = pte_start + i;
> + if (!pte_none(*pte))
> + return;
If !pte_none(), do we fail the hotplug, or just silently
leave the allocated PTEs around? I guess this is the same
as what x86 does.
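FWIW, if we ever wanted to make this visible to the caller rather than
silent, a hypothetical variant could look roughly like this (sketch only,
reusing the names from this patch, not something x86 does):

	static bool free_pte_table(pte_t *pte_start, pmd_t *pmd)
	{
		int i;

		/* leave the page alone if any entry is still in use */
		for (i = 0; i < PTRS_PER_PTE; i++) {
			if (!pte_none(*(pte_start + i)))
				return false;
		}

		pte_free_kernel(&init_mm, pte_start);
		pmd_clear(pmd);
		return true;
	}

Not suggesting we need that here, just noting the option.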
> + }
> +
> + pte_free_kernel(&init_mm, pte_start);
> + pmd_clear(pmd);
> +}
> +
> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> + pmd_t *pmd;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd = pmd_start + i;
> + if (!pmd_none(*pmd))
> + return;
> + }
> +
> + pmd_free(&init_mm, pmd_start);
> + pud_clear(pud);
> +}
> +
> +static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> + unsigned long end)
> +{
> + unsigned long next;
> + pte_t *pte;
> +
> + pte = pte_start + pte_index(addr);
> + for (; addr < end; addr = next, pte++) {
> + next = (addr + PAGE_SIZE) & PAGE_MASK;
> + if (next > end)
> + next = end;
> +
> + if (!pte_present(*pte))
> + continue;
> +
> + pte_clear(&init_mm, addr, pte);
> + }
> +}
> +
> +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> + unsigned long end)
> +{
> + unsigned long next;
> + pte_t *pte_base;
> + pmd_t *pmd;
> +
> + pmd = pmd_start + pmd_index(addr);
> + for (; addr < end; addr = next, pmd++) {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_present(*pmd))
> + continue;
> +
> + if (pmd_huge(*pmd)) {
> + pte_clear(&init_mm, addr, (pte_t *)pmd);
pmd_clear()?
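i.e. something like this (untested sketch of what I mean):

		if (pmd_huge(*pmd)) {
			pmd_clear(pmd);
			continue;
		}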
> + continue;
> + }
> +
> + pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> + remove_pte_table(pte_base, addr, next);
> + free_pte_table(pte_base, pmd);
> + }
> +}
> +
> +static void remove_pud_table(pud_t *pud_start, unsigned long addr,
> + unsigned long end)
> +{
> + unsigned long next;
> + pmd_t *pmd_base;
> + pud_t *pud;
> +
> + pud = pud_start + pud_index(addr);
> + for (; addr < end; addr = next, pud++) {
> + next = pud_addr_end(addr, end);
> +
> + if (!pud_present(*pud))
> + continue;
> +
> + if (pud_huge(*pud)) {
> + pte_clear(&init_mm, addr, (pte_t *)pud);
pud_clear()?
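Same idea here (sketch):

		if (pud_huge(*pud)) {
			pud_clear(pud);
			continue;
		}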
> + continue;
> + }
> +
> + pmd_base = (pmd_t *)pud_page_vaddr(*pud);
> + remove_pmd_table(pmd_base, addr, next);
> + free_pmd_table(pmd_base, pud);
> + }
> +}
> +
> +static void remove_pagetable(unsigned long start, unsigned long end)
> +{
> + unsigned long addr, next;
> + pud_t *pud_base;
> + pgd_t *pgd;
> +
> + spin_lock(&init_mm.page_table_lock);
> +
x86 does more granular lock acquisition, taking the lock
only while clearing the relevant entries. I suppose we don't
have to worry about that here, since this is neither a fast
path nor a frequent operation.
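For reference, roughly what the more granular variant would look like
(sketch only, following the x86 pattern, applied to free_pte_table()
from this patch):

	static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
	{
		pte_t *pte;
		int i;

		for (i = 0; i < PTRS_PER_PTE; i++) {
			pte = pte_start + i;
			if (!pte_none(*pte))
				return;
		}

		pte_free_kernel(&init_mm, pte_start);

		/* take the lock only around clearing the upper-level entry */
		spin_lock(&init_mm.page_table_lock);
		pmd_clear(pmd);
		spin_unlock(&init_mm.page_table_lock);
	}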
Balbir Singh.