[PATCH v4 2/3] powerpc/mm/hugetlb: Add support for reserving gigantic huge pages via kernel command line

Christophe LEROY christophe.leroy at c-s.fr
Wed Aug 2 16:13:43 AEST 2017


Hi,

Le 28/07/2017 à 07:01, Aneesh Kumar K.V a écrit :
> With commit aa888a74977a8 ("hugetlb: support larger than MAX_ORDER") we added
> support for allocating gigantic hugepages via kernel command line. Switch
> ppc64 arch specific code to use that.
> 
> W.r.t FSL support, we now limit our allocation range using BOOTMEM_ALLOC_ACCESSIBLE.
> 
> We use the kernel command line to do reservation of hugetlb pages on powernv
> platforms. On pseries hash mmu mode the supported gigantic huge page size is
> 16GB and that can only be allocated with hypervisor assist. For pseries the
> command line option doesn't do the allocation. Instead pseries does gigantic
> hugepage allocation based on hypervisor hint that is specified via
> "ibm,expected#pages" property of the memory node.

It looks like this doesn't work on the 8xx: the boot log reports 4 pre-allocated 8M pages, but /proc/meminfo shows HugePages_Total as 0:

root@vgoip:~# dmesg | grep -i huge
[    0.000000] Kernel command line: console=ttyCPM0,115200N8 ip=172.25.231.25:172.25.231.1::255.0.0.0:vgoip:eth0:off hugepagesz=8M hugepages=4
[    0.416722] HugeTLB registered 8.00 MiB page size, pre-allocated 4 pages
[    0.423184] HugeTLB registered 512 KiB page size, pre-allocated 0 pages
root@vgoip:~# cat /proc/meminfo
MemTotal:         123388 kB
MemFree:           77900 kB
MemAvailable:      78412 kB
Buffers:               0 kB
Cached:             3964 kB
SwapCached:            0 kB
Active:             3788 kB
Inactive:           1680 kB
Active(anon):       1636 kB
Inactive(anon):       20 kB
Active(file):       2152 kB
Inactive(file):     1660 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:                 0 kB
Writeback:             0 kB
AnonPages:          1552 kB
Mapped:             2404 kB
Shmem:               152 kB
Slab:                  0 kB
SReclaimable:          0 kB
SUnreclaim:            0 kB
KernelStack:         304 kB
PageTables:          208 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:       45308 kB
Committed_AS:      16664 kB
VmallocTotal:     866304 kB
VmallocUsed:           0 kB
VmallocChunk:          0 kB
HugePages_Total:       0
HugePages_Free:        0
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:        512 kB

Christophe
> 
> Cc: Scott Wood <oss at buserror.net>
> Cc: Christophe Leroy <christophe.leroy at c-s.fr>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/book3s/64/mmu-hash.h |   2 +-
>   arch/powerpc/include/asm/hugetlb.h            |  14 --
>   arch/powerpc/kernel/setup-common.c            |   7 -
>   arch/powerpc/mm/hash_utils_64.c               |   2 +-
>   arch/powerpc/mm/hugetlbpage.c                 | 177 +++-----------------------
>   arch/powerpc/mm/init_32.c                     |   2 -
>   6 files changed, 22 insertions(+), 182 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 6981a52b3887..f28d21c69f79 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -468,7 +468,7 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
>   			     int psize, int ssize);
>   int htab_remove_mapping(unsigned long vstart, unsigned long vend,
>   			int psize, int ssize);
> -extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages);
> +extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages);
>   extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
>   
>   #ifdef CONFIG_PPC_PSERIES
> diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
> index 7f4025a6c69e..b8a0fb442c64 100644
> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -218,18 +218,4 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
>   }
>   #endif /* CONFIG_HUGETLB_PAGE */
>   
> -/*
> - * FSL Book3E platforms require special gpage handling - the gpages
> - * are reserved early in the boot process by memblock instead of via
> - * the .dts as on IBM platforms.
> - */
> -#if defined(CONFIG_HUGETLB_PAGE) && (defined(CONFIG_PPC_FSL_BOOK3E) || \
> -    defined(CONFIG_PPC_8xx))
> -extern void __init reserve_hugetlb_gpages(void);
> -#else
> -static inline void reserve_hugetlb_gpages(void)
> -{
> -}
> -#endif
> -
>   #endif /* _ASM_POWERPC_HUGETLB_H */
> diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
> index 94a948207cd2..0f896f17d5ab 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -916,13 +916,6 @@ void __init setup_arch(char **cmdline_p)
>   	/* Reserve large chunks of memory for use by CMA for KVM. */
>   	kvm_cma_reserve();
>   
> -	/*
> -	 * Reserve any gigantic pages requested on the command line.
> -	 * memblock needs to have been initialized by the time this is
> -	 * called since this will reserve memory.
> -	 */
> -	reserve_hugetlb_gpages();
> -
>   	klp_init_thread_info(&init_thread_info);
>   
>   	init_mm.start_code = (unsigned long)_stext;
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index 7a20669c19e7..2f1f6bc04012 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -509,7 +509,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
>   			phys_addr, block_size, expected_pages);
>   	if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
>   		memblock_reserve(phys_addr, block_size * expected_pages);
> -		add_gpage(phys_addr, block_size, expected_pages);
> +		pseries_add_gpage(phys_addr, block_size, expected_pages);
>   	}
>   	return 0;
>   }
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index e1bf5ca397fe..a0271d738a30 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -36,26 +36,6 @@
>   unsigned int HPAGE_SHIFT;
>   EXPORT_SYMBOL(HPAGE_SHIFT);
>   
> -/*
> - * Tracks gpages after the device tree is scanned and before the
> - * huge_boot_pages list is ready.  On non-Freescale implementations, this is
> - * just used to track 16G pages and so is a single array.  FSL-based
> - * implementations may have more than one gpage size, so we need multiple
> - * arrays
> - */
> -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
> -#define MAX_NUMBER_GPAGES	128
> -struct psize_gpages {
> -	u64 gpage_list[MAX_NUMBER_GPAGES];
> -	unsigned int nr_gpages;
> -};
> -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
> -#else
> -#define MAX_NUMBER_GPAGES	1024
> -static u64 gpage_freearray[MAX_NUMBER_GPAGES];
> -static unsigned nr_gpages;
> -#endif
> -
>   #define hugepd_none(hpd)	(hpd_val(hpd) == 0)
>   
>   pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
> @@ -210,145 +190,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
>   	return hugepte_offset(*hpdp, addr, pdshift);
>   }
>   
> -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
> -/* Build list of addresses of gigantic pages.  This function is used in early
> - * boot before the buddy allocator is setup.
> - */
> -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
> -{
> -	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
> -	int i;
> -
> -	if (addr == 0)
> -		return;
> -
> -	gpage_freearray[idx].nr_gpages = number_of_pages;
> -
> -	for (i = 0; i < number_of_pages; i++) {
> -		gpage_freearray[idx].gpage_list[i] = addr;
> -		addr += page_size;
> -	}
> -}
> -
> -/*
> - * Moves the gigantic page addresses from the temporary list to the
> - * huge_boot_pages list.
> - */
> -int alloc_bootmem_huge_page(struct hstate *hstate)
> -{
> -	struct huge_bootmem_page *m;
> -	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
> -	int nr_gpages = gpage_freearray[idx].nr_gpages;
> -
> -	if (nr_gpages == 0)
> -		return 0;
> -
> -#ifdef CONFIG_HIGHMEM
> -	/*
> -	 * If gpages can be in highmem we can't use the trick of storing the
> -	 * data structure in the page; allocate space for this
> -	 */
> -	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
> -	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
> -#else
> -	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
> -#endif
> -
> -	list_add(&m->list, &huge_boot_pages);
> -	gpage_freearray[idx].nr_gpages = nr_gpages;
> -	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
> -	m->hstate = hstate;
> -
> -	return 1;
> -}
> +#ifdef CONFIG_PPC_BOOK3S_64
>   /*
> - * Scan the command line hugepagesz= options for gigantic pages; store those in
> - * a list that we use to allocate the memory once all options are parsed.
> + * Tracks gpages after the device tree is scanned and before the
> + * huge_boot_pages list is ready on pSeries.
>    */
> -
> -unsigned long gpage_npages[MMU_PAGE_COUNT];
> -
> -static int __init do_gpage_early_setup(char *param, char *val,
> -				       const char *unused, void *arg)
> -{
> -	static phys_addr_t size;
> -	unsigned long npages;
> -
> -	/*
> -	 * The hugepagesz and hugepages cmdline options are interleaved.  We
> -	 * use the size variable to keep track of whether or not this was done
> -	 * properly and skip over instances where it is incorrect.  Other
> -	 * command-line parsing code will issue warnings, so we don't need to.
> -	 *
> -	 */
> -	if ((strcmp(param, "default_hugepagesz") == 0) ||
> -	    (strcmp(param, "hugepagesz") == 0)) {
> -		size = memparse(val, NULL);
> -	} else if (strcmp(param, "hugepages") == 0) {
> -		if (size != 0) {
> -			if (sscanf(val, "%lu", &npages) <= 0)
> -				npages = 0;
> -			if (npages > MAX_NUMBER_GPAGES) {
> -				pr_warn("MMU: %lu pages requested for page "
> -#ifdef CONFIG_PHYS_ADDR_T_64BIT
> -					"size %llu KB, limiting to "
> -#else
> -					"size %u KB, limiting to "
> -#endif
> -					__stringify(MAX_NUMBER_GPAGES) "\n",
> -					npages, size / 1024);
> -				npages = MAX_NUMBER_GPAGES;
> -			}
> -			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
> -			size = 0;
> -		}
> -	}
> -	return 0;
> -}
> -
> +#define MAX_NUMBER_GPAGES	1024
> +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
> +__initdata static unsigned nr_gpages;
>   
>   /*
> - * This function allocates physical space for pages that are larger than the
> - * buddy allocator can handle.  We want to allocate these in highmem because
> - * the amount of lowmem is limited.  This means that this function MUST be
> - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
> - * allocate to grab highmem.
> - */
> -void __init reserve_hugetlb_gpages(void)
> -{
> -	static __initdata char cmdline[COMMAND_LINE_SIZE];
> -	phys_addr_t size, base;
> -	int i;
> -
> -	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
> -	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
> -			NULL, &do_gpage_early_setup);
> -
> -	/*
> -	 * Walk gpage list in reverse, allocating larger page sizes first.
> -	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
> -	 * When we reach the point in the list where pages are no longer
> -	 * considered gpages, we're done.
> -	 */
> -	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
> -		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
> -			continue;
> -		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
> -			break;
> -
> -		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
> -		base = memblock_alloc_base(size * gpage_npages[i], size,
> -					   MEMBLOCK_ALLOC_ANYWHERE);
> -		add_gpage(base, size, gpage_npages[i]);
> -	}
> -}
> -
> -#else /* !PPC_FSL_BOOK3E */
> -
> -/* Build list of addresses of gigantic pages.  This function is used in early
> + * Build list of addresses of gigantic pages.  This function is used in early
>    * boot before the buddy allocator is setup.
>    */
> -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
> +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
>   {
>   	if (!addr)
>   		return;
> @@ -360,10 +215,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
>   	}
>   }
>   
> -/* Moves the gigantic page addresses from the temporary list to the
> - * huge_boot_pages list.
> - */
> -int alloc_bootmem_huge_page(struct hstate *hstate)
> +int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
>   {
>   	struct huge_bootmem_page *m;
>   	if (nr_gpages == 0)
> @@ -376,6 +228,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
>   }
>   #endif
>   
> +
> +int __init alloc_bootmem_huge_page(struct hstate *h)
> +{
> +
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
> +		return pseries_alloc_bootmem_huge_page(h);
> +#endif
> +	return __alloc_bootmem_huge_page(h);
> +}
> +
>   #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
>   #define HUGEPD_FREELIST_SIZE \
>   	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
> diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
> index 8a7c38b8d335..436d9721ab63 100644
> --- a/arch/powerpc/mm/init_32.c
> +++ b/arch/powerpc/mm/init_32.c
> @@ -132,8 +132,6 @@ void __init MMU_init(void)
>   	 * Reserve gigantic pages for hugetlb.  This MUST occur before
>   	 * lowmem_end_addr is initialized below.
>   	 */
> -	reserve_hugetlb_gpages();
> -
>   	if (memblock.memory.cnt > 1) {
>   #ifndef CONFIG_WII
>   		memblock_enforce_memory_limit(memblock.memory.regions[0].size);
> 


More information about the Linuxppc-dev mailing list