[PATCH v2 14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

Ritesh Harjani (IBM) ritesh.list at gmail.com
Wed Jun 28 11:33:03 AEST 2023


"Aneesh Kumar K.V" <aneesh.kumar at linux.ibm.com> writes:

> This is in preparation to update radix to implement vmemmap optimization
> for devdax. Below are the rules w.r.t radix vmemmap mapping
>
> 1. First try to map things using PMD (2M)
> 2. With altmap if altmap cross-boundary check returns true, fall back to
>    PAGE_SIZE
> 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to
>    PAGE_SIZE
>
> On removing vmemmap mapping, check if every subsection that is using the
> vmemmap area is invalid. If found to be invalid, that implies we can safely
> free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86
> because with 64K page size, we need to do the above check even at the
> PAGE_SIZE granularity.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |   2 +
>  arch/powerpc/include/asm/pgtable.h         |   3 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 319 +++++++++++++++++++--
>  arch/powerpc/mm/init_64.c                  |  26 +-
>  4 files changed, 319 insertions(+), 31 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 8cdff5a05011..87d4c1e62491 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -332,6 +332,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>  					     unsigned long phys);
>  int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
>  				      int node, struct vmem_altmap *altmap);
> +void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
> +			       struct vmem_altmap *altmap);
>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>  				    unsigned long page_size);
>  
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 9972626ddaf6..6d4cd2ebae6e 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -168,6 +168,9 @@ static inline bool is_ioremap_addr(const void *x)
>  
>  struct seq_file;
>  void arch_report_meminfo(struct seq_file *m);
> +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size);
> +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
> +			   unsigned long page_size);
>  #endif /* CONFIG_PPC64 */
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index d7e2dd3d4add..ef886fab643d 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -742,8 +742,57 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
>  	p4d_clear(p4d);
>  }
>  
> +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
> +{
> +	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> +	return !vmemmap_populated(start, PMD_SIZE);
> +}
> +
> +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
> +{
> +	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
> +
> +	return !vmemmap_populated(start, PAGE_SIZE);
> +
> +}
> +
> +static void __meminit free_vmemmap_pages(struct page *page,
> +					 struct vmem_altmap *altmap,
> +					 int order)
> +{
> +	unsigned int nr_pages = 1 << order;
> +
> +	if (altmap) {
> +		unsigned long alt_start, alt_end;
> +		unsigned long base_pfn = page_to_pfn(page);
> +
> +		/*
> +		 * With 1G vmemmap mapping we can have things set up
> +		 * such that even though altmap is specified we never
> +		 * used altmap.
> +		 */
> +		alt_start = altmap->base_pfn;
> +		alt_end = altmap->base_pfn + altmap->reserve +
> +			altmap->free + altmap->alloc + altmap->align;
> +
> +		if (base_pfn >= alt_start && base_pfn < alt_end) {
> +			vmem_altmap_free(altmap, nr_pages);
> +			return;
> +		}
> +	}
> +
> +	if (PageReserved(page)) {
> +		/* allocated from memblock */
> +		while (nr_pages--)
> +			free_reserved_page(page++);
> +	} else
> +		free_pages((unsigned long)page_address(page), order);
> +}
> +
>  static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> -			     unsigned long end, bool direct)
> +			     unsigned long end, bool direct,
> +			     struct vmem_altmap *altmap)
>  {
>  	unsigned long next, pages = 0;
>  	pte_t *pte;
> @@ -757,24 +806,23 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  
> -		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
> -			/*
> -			 * The vmemmap_free() and remove_section_mapping()
> -			 * codepaths call us with aligned addresses.
> -			 */
> -			WARN_ONCE(1, "%s: unaligned range\n", __func__);
> -			continue;
> +		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
> +			if (!direct)
> +				free_vmemmap_pages(pte_page(*pte), altmap, 0);
> +			pte_clear(&init_mm, addr, pte);
> +			pages++;
> +		} else if (!direct && vmemmap_page_is_unused(addr, next)) {
> +			free_vmemmap_pages(pte_page(*pte), altmap, 0);
> +			pte_clear(&init_mm, addr, pte);
>  		}
> -
> -		pte_clear(&init_mm, addr, pte);
> -		pages++;
>  	}
>  	if (direct)
>  		update_page_count(mmu_virtual_psize, -pages);
>  }
>  
>  static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> -				       unsigned long end, bool direct)
> +				       unsigned long end, bool direct,
> +				       struct vmem_altmap *altmap)
>  {
>  	unsigned long next, pages = 0;
>  	pte_t *pte_base;
> @@ -788,18 +836,21 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>  			continue;
>  
>  		if (pmd_is_leaf(*pmd)) {
> -			if (!IS_ALIGNED(addr, PMD_SIZE) ||
> -			    !IS_ALIGNED(next, PMD_SIZE)) {
> -				WARN_ONCE(1, "%s: unaligned range\n", __func__);
> -				continue;
> +			if (IS_ALIGNED(addr, PMD_SIZE) &&
> +			    IS_ALIGNED(next, PMD_SIZE)) {
> +				if (!direct)
> +					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
> +				pte_clear(&init_mm, addr, (pte_t *)pmd);
> +				pages++;
> +			} else if (vmemmap_pmd_is_unused(addr, next)) {
> +				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
> +				pte_clear(&init_mm, addr, (pte_t *)pmd);
>  			}
> -			pte_clear(&init_mm, addr, (pte_t *)pmd);
> -			pages++;
>  			continue;
>  		}
>  
>  		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> -		remove_pte_table(pte_base, addr, next, direct);
> +		remove_pte_table(pte_base, addr, next, direct, altmap);
>  		free_pte_table(pte_base, pmd);
>  	}
>  	if (direct)
> @@ -807,7 +858,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>  }
>  
>  static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
> -				       unsigned long end, bool direct)
> +				       unsigned long end, bool direct,
> +				       struct vmem_altmap *altmap)
>  {
>  	unsigned long next, pages = 0;
>  	pmd_t *pmd_base;
> @@ -832,15 +884,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
>  		}
>  
>  		pmd_base = pud_pgtable(*pud);
> -		remove_pmd_table(pmd_base, addr, next, direct);
> +		remove_pmd_table(pmd_base, addr, next, direct, altmap);
>  		free_pmd_table(pmd_base, pud);
>  	}
>  	if (direct)
>  		update_page_count(MMU_PAGE_1G, -pages);
>  }
>  
> -static void __meminit remove_pagetable(unsigned long start, unsigned long end,
> -				       bool direct)
> +static void __meminit
> +remove_pagetable(unsigned long start, unsigned long end, bool direct,
> +		 struct vmem_altmap *altmap)
>  {
>  	unsigned long addr, next;
>  	pud_t *pud_base;
> @@ -869,7 +922,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end,
>  		}
>  
>  		pud_base = p4d_pgtable(*p4d);
> -		remove_pud_table(pud_base, addr, next, direct);
> +		remove_pud_table(pud_base, addr, next, direct, altmap);
>  		free_pud_table(pud_base, p4d);
>  	}
>  
> @@ -892,7 +945,7 @@ int __meminit radix__create_section_mapping(unsigned long start,
>  
>  int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
>  {
> -	remove_pagetable(start, end, true);
> +	remove_pagetable(start, end, true, NULL);
>  	return 0;
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> @@ -924,10 +977,224 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
>  	return 0;
>  }
>  
> +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
> +				unsigned long addr, unsigned long next)
> +{
> +	int large = pmd_large(*pmd);
> +
> +	if (pmd_large(*pmd))

We already have the value of pmd_large() in the "large" variable,
so we can just use if (large) here, right?

> +		vmemmap_verify((pte_t *)pmd, node, addr, next);

Maybe we can use the pmdp_ptep() function here, which we used in the 1st patch?
Also, shouldn't the function argument be named pmdp instead of pmd?

> +
> +	return large;
> +}
> +
> +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
> +			       unsigned long addr, unsigned long next)
> +{
> +	pte_t entry;
> +	pte_t *ptep = pmdp_ptep(pmdp);
> +
> +	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
> +	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
> +	set_pte_at(&init_mm, addr, ptep, entry);
> +	asm volatile("ptesync": : :"memory");
> +
> +	vmemmap_verify(ptep, node, addr, next);
> +}
> +
> +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> +						     struct vmem_altmap *altmap,
> +						     struct page *reuse)
> +{
> +	pte_t *pte = pte_offset_kernel(pmd, addr);
> +
> +	if (pte_none(*pte)) {
> +		pte_t entry;
> +		void *p;
> +
> +		if (!reuse) {
> +			/*
> +			 * make sure we don't create altmap mappings
> +			 * covering things outside the device.
> +			 */
> +			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
> +				altmap = NULL;
> +
> +			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> +			if (!p) {
> +				if (altmap)
> +					p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
> +				if (!p)
> +					return NULL;
> +			}

The above if conditions are quite confusing when reading them for the first time.
Can we do this instead? Did I get it right?

                if (!p && altmap)
                  p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);

                if (!p)
                  return NULL;

-ritesh


More information about the Linuxppc-dev mailing list