[PATCH] powerpc/mm/book3s64/hash: Update 4k PAGE_SIZE kernel mapping

Cameron Berkenpas cam at neo-zeon.de
Thu Oct 17 03:21:55 AEDT 2019


Seems to work for me so far! I've tested it successfully against 5.2.21 and
5.3.6.

Thanks!

-Cameron

On 10/15/19 10:51 PM, Aneesh Kumar K.V wrote:
> With commit 0034d395f89d ("powerpc/mm/hash64: Map all the kernel
> regions in the same 0xc range"), the kernel now splits the 64TB address
> range into 4 contexts of 16TB each. That implies we can only do a 16TB
> linear mapping. This results in boot failure on some P9 systems.
>
> Fix this by redoing the hash 4k mapping as below.
>
>   vmalloc start     = 0xd000000000000000
>   IO start          = 0xd000380000000000
>   vmemmap start     = 0xf000000000000000
>
> The vmalloc area is now 56TB in size and the IO remap area is 8TB. We need to
> keep them under the same top nibble because we map both of them in the Linux
> page table and they share the init_mm page table. We need a large vmalloc
> space because we use the percpu embedded first chunk allocator.
>
> The linear and vmemmap ranges are each 64TB in size and are mapped using the
> 0xc and 0xf top nibbles, respectively.
>
> Fixes: 0034d395f89d ("powerpc/mm/hash64: Map all the kernel regions in the same 0xc range")
> Reported-by: Cameron Berkenpas <cam at neo-zeon.de>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.ibm.com>
> ---
>   arch/powerpc/include/asm/book3s/64/hash-4k.h  | 54 ++++++++++--
>   arch/powerpc/include/asm/book3s/64/hash-64k.h | 73 ++++++++++++++++-
>   arch/powerpc/include/asm/book3s/64/hash.h     | 82 ++-----------------
>   3 files changed, 123 insertions(+), 86 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index 8fd8599c9395..4cbb9fe22d76 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -12,23 +12,59 @@
>    * Hence also limit max EA bits to 64TB.
>    */
>   #define MAX_EA_BITS_PER_CONTEXT		46
> -
> -#define REGION_SHIFT		(MAX_EA_BITS_PER_CONTEXT - 2)
> +/*
> + * For 4k hash, since we are restricted by a page table size that
> + * limits our address range to 64TB, keep the kernel virtual
> + * mapping in the 0xd region.
> + */
> +#define H_KERN_VIRT_START	ASM_CONST(0xd000000000000000)
>   
>   /*
> - * Our page table limit us to 64TB. Hence for the kernel mapping,
> - * each MAP area is limited to 16 TB.
> - * The four map areas are:  linear mapping, vmap, IO and vmemmap
> + * Top 4 bits are ignored in page table walk.
>    */
> -#define H_KERN_MAP_SIZE		(ASM_CONST(1) << REGION_SHIFT)
> +#define EA_MASK			(~(0xfUL << 60))
>   
>   /*
> - * Define the address range of the kernel non-linear virtual area
> - * 16TB
> + * Place vmalloc and IO in the 64TB range because we map them via linux page
> + * table and table size is limited to 64TB.
> + */
> +#define H_VMALLOC_START		H_KERN_VIRT_START
> +/*
> + * 56TB vmalloc size. We require large vmalloc space for percpu mapping.
>    */
> -#define H_KERN_VIRT_START	ASM_CONST(0xc000100000000000)
> +#define H_VMALLOC_SIZE		(56UL << 40)
> +#define H_VMALLOC_END		(H_VMALLOC_START + H_VMALLOC_SIZE)
> +
> +#define H_KERN_IO_START		H_VMALLOC_END
> +#define H_KERN_IO_SIZE		(8UL << 40)
> +#define H_KERN_IO_END		(H_KERN_IO_START + H_KERN_IO_SIZE)
> +
> +#define H_VMEMMAP_START		ASM_CONST(0xf000000000000000)
> +#define H_VMEMMAP_SIZE		(1UL << MAX_EA_BITS_PER_CONTEXT)
> +#define H_VMEMMAP_END		(H_VMEMMAP_START + H_VMEMMAP_SIZE)
>   
>   #ifndef __ASSEMBLY__
> +static inline int get_region_id(unsigned long ea)
> +{
> +	int id = (ea >> 60UL);
> +
> +	switch (id) {
> +	case 0x0:
> +		return USER_REGION_ID;
> +	case 0xc:
> +		return LINEAR_MAP_REGION_ID;
> +	case 0xd:
> +		if (ea < H_KERN_IO_START)
> +			return VMALLOC_REGION_ID;
> +		else
> +			return IO_REGION_ID;
> +	case 0xf:
> +		return VMEMMAP_REGION_ID;
> +	default:
> +		return INVALID_REGION_ID;
> +	}
> +}
> +
>   #define H_PTE_TABLE_SIZE	(sizeof(pte_t) << H_PTE_INDEX_SIZE)
>   #define H_PMD_TABLE_SIZE	(sizeof(pmd_t) << H_PMD_INDEX_SIZE)
>   #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << H_PUD_INDEX_SIZE)
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index d1d9177d9ebd..fc44bc590ac8 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -13,18 +13,61 @@
>    * is handled in the hotpath.
>    */
>   #define MAX_EA_BITS_PER_CONTEXT		49
> -#define REGION_SHIFT		MAX_EA_BITS_PER_CONTEXT
> +
> +/*
> + * Define the address range of the kernel non-linear virtual area
> + * 2PB
> + */
> +#define H_KERN_VIRT_START	ASM_CONST(0xc008000000000000)
>   
>   /*
>    * We use one context for each MAP area.
>    */
> +#define REGION_SHIFT		MAX_EA_BITS_PER_CONTEXT
>   #define H_KERN_MAP_SIZE		(1UL << MAX_EA_BITS_PER_CONTEXT)
>   
>   /*
> - * Define the address range of the kernel non-linear virtual area
> - * 2PB
> + * Top 2 bits are ignored in page table walk.
>    */
> -#define H_KERN_VIRT_START	ASM_CONST(0xc008000000000000)
> +#define EA_MASK			(~(0xcUL << 60))
> +
> +/*
> + * +------------------------------+
> + * |                              |
> + * |                              |
> + * |                              |
> + * +------------------------------+  Kernel virtual map end (0xc00e000000000000)
> + * |                              |
> + * |                              |
> + * |      512TB/16TB of vmemmap   |
> + * |                              |
> + * |                              |
> + * +------------------------------+  Kernel vmemmap  start
> + * |                              |
> + * |      512TB/16TB of IO map    |
> + * |                              |
> + * +------------------------------+  Kernel IO map start
> + * |                              |
> + * |      512TB/16TB of vmap      |
> + * |                              |
> + * +------------------------------+  Kernel virt start (0xc008000000000000)
> + * |                              |
> + * |                              |
> + * |                              |
> + * +------------------------------+  Kernel linear (0xc.....)
> + */
> +
> +#define H_VMALLOC_START		H_KERN_VIRT_START
> +#define H_VMALLOC_SIZE		H_KERN_MAP_SIZE
> +#define H_VMALLOC_END		(H_VMALLOC_START + H_VMALLOC_SIZE)
> +
> +#define H_KERN_IO_START		H_VMALLOC_END
> +#define H_KERN_IO_SIZE		H_KERN_MAP_SIZE
> +#define H_KERN_IO_END		(H_KERN_IO_START + H_KERN_IO_SIZE)
> +
> +#define H_VMEMMAP_START		H_KERN_IO_END
> +#define H_VMEMMAP_SIZE		H_KERN_MAP_SIZE
> +#define H_VMEMMAP_END		(H_VMEMMAP_START + H_VMEMMAP_SIZE)
>   
>   /*
>    * 64k aligned address free up few of the lower bits of RPN for us
> @@ -69,6 +112,28 @@
>   #ifndef __ASSEMBLY__
>   #include <asm/errno.h>
>   
> +#define NON_LINEAR_REGION_ID(ea)	((((unsigned long)(ea) - H_KERN_VIRT_START) >> REGION_SHIFT) + 2)
> +
> +static inline int get_region_id(unsigned long ea)
> +{
> +	int region_id;
> +	int id = (ea >> 60UL);
> +
> +	if (id == 0)
> +		return USER_REGION_ID;
> +
> +	if (id != (PAGE_OFFSET >> 60))
> +		return INVALID_REGION_ID;
> +
> +	if (ea < H_KERN_VIRT_START)
> +		return LINEAR_MAP_REGION_ID;
> +
> +	BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2);
> +
> +	region_id = NON_LINEAR_REGION_ID(ea);
> +	return region_id;
> +}
> +
>   /*
>    * With 64K pages on hash table, we have a special PTE format that
>    * uses a second "half" of the page table to encode sub-page information
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 2781ebf6add4..e279224629a2 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -11,6 +11,15 @@
>    *
>    */
>   #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
> +/*
> + * Region IDs
> + */
> +#define USER_REGION_ID		0
> +#define LINEAR_MAP_REGION_ID	1
> +#define VMALLOC_REGION_ID	2
> +#define IO_REGION_ID		3
> +#define VMEMMAP_REGION_ID	4
> +#define INVALID_REGION_ID	5
>   
>   #ifdef CONFIG_PPC_64K_PAGES
>   #include <asm/book3s/64/hash-64k.h>
> @@ -29,10 +38,6 @@
>   #define H_PGTABLE_EADDR_SIZE	(H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \
>   				 H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT)
>   #define H_PGTABLE_RANGE		(ASM_CONST(1) << H_PGTABLE_EADDR_SIZE)
> -/*
> - * Top 2 bits are ignored in page table walk.
> - */
> -#define EA_MASK			(~(0xcUL << 60))
>   
>   /*
>    * We store the slot details in the second half of page table.
> @@ -45,56 +50,6 @@
>   #define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE)
>   #endif
>   
> -/*
> - * +------------------------------+
> - * |                              |
> - * |                              |
> - * |                              |
> - * +------------------------------+  Kernel virtual map end (0xc00e000000000000)
> - * |                              |
> - * |                              |
> - * |      512TB/16TB of vmemmap   |
> - * |                              |
> - * |                              |
> - * +------------------------------+  Kernel vmemmap  start
> - * |                              |
> - * |      512TB/16TB of IO map    |
> - * |                              |
> - * +------------------------------+  Kernel IO map start
> - * |                              |
> - * |      512TB/16TB of vmap      |
> - * |                              |
> - * +------------------------------+  Kernel virt start (0xc008000000000000)
> - * |                              |
> - * |                              |
> - * |                              |
> - * +------------------------------+  Kernel linear (0xc.....)
> - */
> -
> -#define H_VMALLOC_START		H_KERN_VIRT_START
> -#define H_VMALLOC_SIZE		H_KERN_MAP_SIZE
> -#define H_VMALLOC_END		(H_VMALLOC_START + H_VMALLOC_SIZE)
> -
> -#define H_KERN_IO_START		H_VMALLOC_END
> -#define H_KERN_IO_SIZE		H_KERN_MAP_SIZE
> -#define H_KERN_IO_END		(H_KERN_IO_START + H_KERN_IO_SIZE)
> -
> -#define H_VMEMMAP_START		H_KERN_IO_END
> -#define H_VMEMMAP_SIZE		H_KERN_MAP_SIZE
> -#define H_VMEMMAP_END		(H_VMEMMAP_START + H_VMEMMAP_SIZE)
> -
> -#define NON_LINEAR_REGION_ID(ea)	((((unsigned long)ea - H_KERN_VIRT_START) >> REGION_SHIFT) + 2)
> -
> -/*
> - * Region IDs
> - */
> -#define USER_REGION_ID		0
> -#define LINEAR_MAP_REGION_ID	1
> -#define VMALLOC_REGION_ID	NON_LINEAR_REGION_ID(H_VMALLOC_START)
> -#define IO_REGION_ID		NON_LINEAR_REGION_ID(H_KERN_IO_START)
> -#define VMEMMAP_REGION_ID	NON_LINEAR_REGION_ID(H_VMEMMAP_START)
> -#define INVALID_REGION_ID	(VMEMMAP_REGION_ID + 1)
> -
>   /*
>    * Defines the address of the vmemap area, in its own region on
>    * hash table CPUs.
> @@ -112,25 +67,6 @@
>   #define H_PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
>   
>   #ifndef __ASSEMBLY__
> -static inline int get_region_id(unsigned long ea)
> -{
> -	int region_id;
> -	int id = (ea >> 60UL);
> -
> -	if (id == 0)
> -		return USER_REGION_ID;
> -
> -	if (id != (PAGE_OFFSET >> 60))
> -		return INVALID_REGION_ID;
> -
> -	if (ea < H_KERN_VIRT_START)
> -		return LINEAR_MAP_REGION_ID;
> -
> -	BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2);
> -
> -	region_id = NON_LINEAR_REGION_ID(ea);
> -	return region_id;
> -}
>   
>   #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
>   #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)



More information about the Linuxppc-dev mailing list