[RFC] powerpc/mm/pgtable: Split mappings on hot-unplug
Balbir Singh
bsingharora at gmail.com
Fri Feb 2 14:08:05 AEDT 2018
On Thu, Feb 1, 2018 at 11:48 PM, Balbir Singh <bsingharora at gmail.com> wrote:
> This patch splits a linear mapping if the hot-unplug range
> is smaller than the mapping size. The code detects if the mapping
> needs to be split into a smaller size and if so, uses the stop
> machine infrastructure to map the current linear mapping with
> a smaller size mapping. Then the requested area is unmapped.
>
> A new API to do a local TLB flush is introduced and exported.
> This API is used in the stop machine callback after clearing the larger
> mapping and before creating the smaller mapping. There is also
> a check to ensure that we don't overlap with kernel text
> (the region that is being split) as it could cause the executing
> text to fault.
>
> There are some caveats with this approach, as the unmapping can
> cause a reserved memory area to be freed up (memblock reserved
> mappings and allocations as well), which can be a problem. There
> could be a case where radix__map_kernel_page() may fault if the
> area selected to unmap contains the process page table, this is
> not a new problem created by the patch, but something to watch
> out for while testing. The right approach to solving the hot-unplug
> issue is to make the node/DIMM be marked hotpluggable so that
> all the memory in it is movable. Currently there is no method
> of enforcing the hotpluggable property via libvirt or qemu.
>
> I've tested these changes under a kvm guest with 2 vcpus, from
> a split mapping point of view, some of the caveats mentioned
> above applied to the testing I did.
>
> TODOs
>
> 1. In addition to checking for overlapping text, we should check for
> reserved memory allocations in the unmapped region
> 2. The code has duplication across several remove_*pgtable
> (pgd/pud/pmd), we could create helper functions to simplify
> the functionality.
>
> Signed-off-by: Balbir Singh <bsingharora at gmail.com>
> ---
> .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 1 +
> arch/powerpc/mm/pgtable-radix.c | 130 +++++++++++++++++----
> arch/powerpc/mm/tlb-radix.c | 6 +
> 3 files changed, 116 insertions(+), 21 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 6a9e68003387..d1d4104e676d 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -20,6 +20,7 @@ extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
> extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
> unsigned long end);
> extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
> +extern void radix__flush_tlb_local_kernel_range(unsigned long start, unsigned long end);
>
> extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
> extern void radix__local_flush_all_mm(struct mm_struct *mm);
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
> index cfbbee941a76..10b6a1c433ff 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -17,6 +17,7 @@
> #include <linux/of_fdt.h>
> #include <linux/mm.h>
> #include <linux/string_helpers.h>
> +#include <linux/stop_machine.h>
>
> #include <asm/pgtable.h>
> #include <asm/pgalloc.h>
> @@ -671,6 +672,35 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> pud_clear(pud);
> }
>
> +struct change_mapping_params {
> + pte_t *pte;
> + unsigned long start;
> + unsigned long end;
> + unsigned long size;
> + unsigned long aligned_start;
> + unsigned long aligned_end;
> +};
> +
> +static int stop_machine_change_mapping(void *data)
> +{
> + struct change_mapping_params *params =
> + (struct change_mapping_params *)data;
> + unsigned long addr;
> +
> + if (!data)
> + return -1;
> +
> + spin_unlock(&init_mm.page_table_lock);
> + pte_clear(&init_mm, params->aligned_start, params->pte);
> + radix__flush_tlb_local_kernel_range(params->aligned_start,
> + params->aligned_end);
> + for (addr = params->aligned_start; addr < params->aligned_end;
> + addr += params->size)
> + create_physical_mapping(addr, addr + params->size);
> + spin_lock(&init_mm.page_table_lock);
> + return 0;
> +}
> +
> static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> unsigned long end)
> {
> @@ -699,12 +729,65 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> }
> }
>
> +/*
> + * clear the pte and potentially split the mapping helper
> + * Return values are as follows
> + *
> + * 0: pte cleared, no changes required
> + * 1: pte cleared and mapping split
> + * -1: We can't clear this pte range
> + */
> +static int clear_and_maybe_split_kernel_mapping(unsigned long addr,
> + unsigned long end, unsigned long size, unsigned long prev_size,
> + pte_t *pte)
> +{
> + unsigned long mask = ~(size - 1);
> + unsigned long aligned_start = addr & mask;
> + unsigned long aligned_end = addr + size;
> + struct change_mapping_params params;
> + bool split_region = false;
> +
> + if ((end - addr) < size) {
> + /*
> + * We're going to clear the PTE, but not flushed
> + * the mapping, time to remap and flush. The
> + * effects if visible outside the processor or
> + * if we are running in code close to the
> + * mapping we cleared, we are in trouble.
> + */
> + if (overlaps_kernel_text(aligned_start, addr) ||
> + overlaps_kernel_text(end, aligned_end)) {
> + /*
> + * Hack, just return, don't pte_clear
> + */
> + return -1;
> + }
> + split_region = true;
> + }
> +
> + if (split_region) {
> + params.pte = pte;
> + params.start = addr;
> + params.end = end;
> + params.size = prev_size;
> + params.aligned_end = aligned_end;
> + params.aligned_end = min_t(unsigned long, aligned_end,
> + (unsigned long)__va(memblock_end_of_DRAM()));
> + stop_machine(stop_machine_change_mapping, ¶ms, NULL);
> + return 1;
> + }
> +
> + pte_clear(&init_mm, addr, pte);
> + return 0;
> +}
> +
> static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> unsigned long end)
> {
> unsigned long next;
> pte_t *pte_base;
> pmd_t *pmd;
> + int rc = 0;
>
> pmd = pmd_start + pmd_index(addr);
> for (; addr < end; addr = next, pmd++) {
> @@ -714,14 +797,15 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> continue;
>
> if (pmd_huge(*pmd)) {
> - if (!IS_ALIGNED(addr, PMD_SIZE) ||
> - !IS_ALIGNED(next, PMD_SIZE)) {
> - WARN_ONCE(1, "%s: unaligned range\n", __func__);
> + rc = clear_and_maybe_split_kernel_mapping(addr, end,
> + PMD_SIZE, PAGE_SIZE, (pte_t *)pmd);
> + if (rc == 0)
> continue;
> - }
> -
> - pte_clear(&init_mm, addr, (pte_t *)pmd);
> - continue;
> + if (rc == 1) {
> + pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> + remove_pte_table(pte_base, addr, end);
> + } else
> + return;
> }
>
> pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> @@ -736,6 +820,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
> unsigned long next;
> pmd_t *pmd_base;
> pud_t *pud;
> + int rc = 0;
>
> pud = pud_start + pud_index(addr);
> for (; addr < end; addr = next, pud++) {
> @@ -745,14 +830,15 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
> continue;
>
> if (pud_huge(*pud)) {
> - if (!IS_ALIGNED(addr, PUD_SIZE) ||
> - !IS_ALIGNED(next, PUD_SIZE)) {
> - WARN_ONCE(1, "%s: unaligned range\n", __func__);
> + rc = clear_and_maybe_split_kernel_mapping(addr, end,
> + PUD_SIZE, PMD_SIZE, (pte_t *)pud);
> + if (rc == 0)
> continue;
> - }
> -
> - pte_clear(&init_mm, addr, (pte_t *)pud);
> - continue;
> + if (rc == 1) {
> + pmd_base = (pmd_t *)pud_page_vaddr(*pud);
> + remove_pmd_table(pmd_base, addr, end);
> + } else
> + return;
> }
>
> pmd_base = (pmd_t *)pud_page_vaddr(*pud);
> @@ -766,6 +852,7 @@ static void remove_pagetable(unsigned long start, unsigned long end)
> unsigned long addr, next;
> pud_t *pud_base;
> pgd_t *pgd;
> + int rc = 0;
>
> spin_lock(&init_mm.page_table_lock);
>
> @@ -777,14 +864,15 @@ static void remove_pagetable(unsigned long start, unsigned long end)
> continue;
>
> if (pgd_huge(*pgd)) {
> - if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
> - !IS_ALIGNED(next, PGDIR_SIZE)) {
> - WARN_ONCE(1, "%s: unaligned range\n", __func__);
> + rc = clear_and_maybe_split_kernel_mapping(addr, end,
> + PGD_SIZE, PUD_SIZE, (pte_t *)pgd);
> + if (rc == 0)
> continue;
> - }
> -
> - pte_clear(&init_mm, addr, (pte_t *)pgd);
> - continue;
> + if (rc == 1) {
> + pud_base = (pud_t *)pgd_page_vaddr(*pgd);
> + remove_pud_table(pud_base, addr, end);
> + } else
> + return;
> }
>
> pud_base = (pud_t *)pgd_page_vaddr(*pgd);
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index 884f4b705b57..9bbeaef1ee9b 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -323,6 +323,12 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
> }
> EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>
> +void radix__flush_tlb_local_kernel_range(unsigned long start, unsigned long end)
> +{
> + _tlbie_pid(0, RIC_FLUSH_ALL);
> +}
> +EXPORT_SYMBOL(radix__flush_tlb_local_kernel_range);
> +
> #define TLB_FLUSH_ALL -1UL
>
> /*
> --
> 2.13.6
>
+Bharata
Balbir Singh.
More information about the Linuxppc-dev
mailing list