[PATCH V6 2/4] powerpc/mm: Add support for handling > 512TB address in SLB miss
Christophe LEROY
christophe.leroy at c-s.fr
Mon Apr 9 17:19:26 AEST 2018
On 26/03/2018 at 12:04, Aneesh Kumar K.V wrote:
> For addresses above 512TB we allocate additional mmu contexts. To make
> it all easy, addresses above 512TB are handled with IR/DR=1 and with
> stack frame setup.
>
> The mmu_context_t is also updated to track the new extended_ids. To
> support up to 4PB we need a total of 8 contexts.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> [mpe: Minor formatting tweaks and comment wording, switch BUG to WARN
> in get_ea_context().]
> Signed-off-by: Michael Ellerman <mpe at ellerman.id.au>
Compilation fails on mpc885_ads_defconfig + CONFIG_HUGETLBFS:
CC arch/powerpc/mm/slice.o
arch/powerpc/mm/slice.c: In function 'slice_get_unmapped_area':
arch/powerpc/mm/slice.c:655:2: error: implicit declaration of function
'need_extra_context' [-Werror=implicit-function-declaration]
arch/powerpc/mm/slice.c:656:3: error: implicit declaration of function
'alloc_extended_context' [-Werror=implicit-function-declaration]
cc1: all warnings being treated as errors
make[1]: *** [arch/powerpc/mm/slice.o] Error 1
make: *** [arch/powerpc/mm] Error 2
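At first glance the fallback versions of these helpers are added to
asm/mmu_context.h for the !CONFIG_PPC_BOOK3S_64 case too, so my guess
(untested) is that slice.c never includes that header itself and only
gets it transitively on book3s_64. If so, something along these lines
should be enough:

	/* untested guess: make the new helpers visible to slice.c on all configs */
	#include <asm/mmu_context.h>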
Christophe
> ---
> arch/powerpc/include/asm/book3s/64/hash-4k.h | 6 ++
> arch/powerpc/include/asm/book3s/64/hash-64k.h | 6 ++
> arch/powerpc/include/asm/book3s/64/mmu.h | 33 +++++++-
> arch/powerpc/include/asm/mmu_context.h | 39 ++++++++++
> arch/powerpc/include/asm/processor.h | 6 ++
> arch/powerpc/kernel/exceptions-64s.S | 11 ++-
> arch/powerpc/kernel/traps.c | 12 ---
> arch/powerpc/mm/copro_fault.c | 2 +-
> arch/powerpc/mm/hash_utils_64.c | 4 +-
> arch/powerpc/mm/mmu_context_book3s64.c | 15 +++-
> arch/powerpc/mm/pgtable-hash64.c | 2 +-
> arch/powerpc/mm/slb.c | 108 ++++++++++++++++++++++++++
> arch/powerpc/mm/slb_low.S | 11 ++-
> arch/powerpc/mm/slice.c | 15 +++-
> arch/powerpc/mm/tlb_hash64.c | 2 +-
> 15 files changed, 245 insertions(+), 27 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index 67c5475311ee..1a35eb944481 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -11,6 +11,12 @@
> #define H_PUD_INDEX_SIZE 9
> #define H_PGD_INDEX_SIZE 9
>
> +/*
> + * Each context is 512TB. But on 4k we restrict our max TASK size to 64TB,
> + * hence also limit max EA bits to 46 (64TB).
> + */
> +#define MAX_EA_BITS_PER_CONTEXT 46
> +
> #ifndef __ASSEMBLY__
> #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
> #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE)
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index 3bcf269f8f55..8d0cbbb31023 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -7,6 +7,12 @@
> #define H_PUD_INDEX_SIZE 7
> #define H_PGD_INDEX_SIZE 8
>
> +/*
> + * Each context is 512TB in size. An SLB miss for the first (default)
> + * context is handled in the hot path.
> + */
> +#define MAX_EA_BITS_PER_CONTEXT 49
> +
> /*
> * A 64k aligned address frees up a few of the lower bits of the RPN for us.
> * We steal those here. For more details look at pte_pfn/pfn_pte()
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> index c8c836e8ad1b..5094696eecd6 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -91,7 +91,18 @@ struct slice_mask {
> };
>
> typedef struct {
> - mm_context_id_t id;
> + union {
> + /*
> + * We use id as the PIDR content for radix. On hash we can use
> + * more than one id. The extended ids are used when we start
> + * having addresses above 512TB. We allocate one extended id
> + * per 512TB. The new id is then combined with the 49-bit
> + * EA to build a new VA. We always use ESID_BITS_1T_MASK bits
> + * from the EA and the new context id to build the new VAs.
> + */
> + mm_context_id_t id;
> + mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE];
> + };
> u16 user_psize; /* page size index */
>
> /* Number of bits in the mm_cpumask */
> @@ -196,5 +207,25 @@ extern void radix_init_pseries(void);
> static inline void radix_init_pseries(void) { };
> #endif
>
> +static inline int get_ea_context(mm_context_t *ctx, unsigned long ea)
> +{
> + int index = ea >> MAX_EA_BITS_PER_CONTEXT;
> +
> + if (likely(index < ARRAY_SIZE(ctx->extended_id)))
> + return ctx->extended_id[index];
> +
> + /* should never happen */
> + WARN_ON(1);
> + return 0;
> +}
> +
> +static inline unsigned long get_user_vsid(mm_context_t *ctx,
> + unsigned long ea, int ssize)
> +{
> + unsigned long context = get_ea_context(ctx, ea);
> +
> + return get_vsid(context, ea, ssize);
> +}
> +
> #endif /* __ASSEMBLY__ */
> #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
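A worked example of the indexing above, in case it helps review: with 64K
pages MAX_EA_BITS_PER_CONTEXT is 49, so each extended_id[] slot covers one
512TB chunk, and extended_id[0] aliases the default id through the union
(the addresses below are made up for illustration):

	/* illustration only, assuming 64K pages (MAX_EA_BITS_PER_CONTEXT = 49) */
	get_ea_context(ctx, 0x0000100000000000UL); /*  16TB: index 0 -> ctx->id        */
	get_ea_context(ctx, 0x0002000000000000UL); /* 512TB: index 1 -> extended_id[1] */
	get_ea_context(ctx, 0x000fffffffffffffUL); /* 4PB-1: index 7, the last slot    */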
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 3a15b6db9501..1835ca1505d6 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -60,12 +60,51 @@ extern int hash__alloc_context_id(void);
> extern void hash__reserve_context_id(int id);
> extern void __destroy_context(int context_id);
> static inline void mmu_context_init(void) { }
> +
> +static inline int alloc_extended_context(struct mm_struct *mm,
> + unsigned long ea)
> +{
> + int context_id;
> +
> + int index = ea >> MAX_EA_BITS_PER_CONTEXT;
> +
> + context_id = hash__alloc_context_id();
> + if (context_id < 0)
> + return context_id;
> +
> + VM_WARN_ON(mm->context.extended_id[index]);
> + mm->context.extended_id[index] = context_id;
> + return context_id;
> +}
> +
> +static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
> +{
> + int context_id;
> +
> + context_id = get_ea_context(&mm->context, ea);
> + if (!context_id)
> + return true;
> + return false;
> +}
> +
> #else
> extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
> struct task_struct *tsk);
> extern unsigned long __init_new_context(void);
> extern void __destroy_context(unsigned long context_id);
> extern void mmu_context_init(void);
> +static inline int alloc_extended_context(struct mm_struct *mm,
> + unsigned long ea)
> +{
> + /* this should never be called on non-book3s_64 platforms */
> + WARN_ON(1);
> + return -ENOMEM;
> +}
> +
> +static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
> +{
> + return false;
> +}
> #endif
>
> #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> index 01299cdc9806..75b084486ce1 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -119,9 +119,15 @@ void release_thread(struct task_struct *);
> */
> #define TASK_SIZE_USER64 TASK_SIZE_512TB
> #define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB
> +#define TASK_CONTEXT_SIZE TASK_SIZE_512TB
> #else
> #define TASK_SIZE_USER64 TASK_SIZE_64TB
> #define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB
> +/*
> + * We don't need to allocate extended context ids for 4K page size, because
> + * we limit the max effective address on this config to 64TB.
> + */
> +#define TASK_CONTEXT_SIZE TASK_SIZE_64TB
> #endif
>
> /*
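For the record, the arithmetic sizing the extended_id[] array falls out of
these defines: with 64K pages TASK_SIZE_USER64 / TASK_CONTEXT_SIZE is
4PB / 512TB = 8 slots, with 4K pages 64TB / 64TB = 1 slot, i.e. the union
degenerates to the single id. A build-time check along these lines (my
sketch, not part of the patch) would document that assumption:

	/* sketch: the context array must exactly cover the user address space */
	BUILD_BUG_ON(TASK_SIZE_USER64 % TASK_CONTEXT_SIZE != 0);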
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 6bee20c43feb..1a0aa70bcb2b 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -621,7 +621,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
> lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
> mtlr r10
>
> - beq- 8f /* if bad address, make full stack frame */
> + /*
> + * Large address, check whether we have to allocate new contexts.
> + */
> + beq- 8f
>
> bne- cr5,2f /* if unrecoverable exception, oops */
>
> @@ -685,7 +688,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
> mr r3,r12
> mfspr r11,SPRN_SRR0
> mfspr r12,SPRN_SRR1
> - LOAD_HANDLER(r10,bad_addr_slb)
> + LOAD_HANDLER(r10, large_addr_slb)
> mtspr SPRN_SRR0,r10
> ld r10,PACAKMSR(r13)
> mtspr SPRN_SRR1,r10
> @@ -700,7 +703,7 @@ EXC_COMMON_BEGIN(unrecov_slb)
> bl unrecoverable_exception
> b 1b
>
> -EXC_COMMON_BEGIN(bad_addr_slb)
> +EXC_COMMON_BEGIN(large_addr_slb)
> EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB)
> RECONCILE_IRQ_STATE(r10, r11)
> ld r3, PACA_EXSLB+EX_DAR(r13)
> @@ -710,7 +713,7 @@ EXC_COMMON_BEGIN(bad_addr_slb)
> std r10, _TRAP(r1)
> 2: bl save_nvgprs
> addi r3, r1, STACK_FRAME_OVERHEAD
> - bl slb_miss_bad_addr
> + bl slb_miss_large_addr
> b ret_from_except
>
> EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 1e48d157196a..f200bfd98b17 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -1495,18 +1495,6 @@ void alignment_exception(struct pt_regs *regs)
> exception_exit(prev_state);
> }
>
> -void slb_miss_bad_addr(struct pt_regs *regs)
> -{
> - enum ctx_state prev_state = exception_enter();
> -
> - if (user_mode(regs))
> - _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
> - else
> - bad_page_fault(regs, regs->dar, SIGSEGV);
> -
> - exception_exit(prev_state);
> -}
> -
> void StackOverflow(struct pt_regs *regs)
> {
> printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n",
> diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
> index 697b70ad1195..7d0945bd3a61 100644
> --- a/arch/powerpc/mm/copro_fault.c
> +++ b/arch/powerpc/mm/copro_fault.c
> @@ -112,7 +112,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
> return 1;
> psize = get_slice_psize(mm, ea);
> ssize = user_segment_size(ea);
> - vsid = get_vsid(mm->context.id, ea, ssize);
> + vsid = get_user_vsid(&mm->context, ea, ssize);
> vsidkey = SLB_VSID_USER;
> break;
> case VMALLOC_REGION_ID:
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index b578148d89e6..f62325d4f5f5 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -1261,7 +1261,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
> }
> psize = get_slice_psize(mm, ea);
> ssize = user_segment_size(ea);
> - vsid = get_vsid(mm->context.id, ea, ssize);
> + vsid = get_user_vsid(&mm->context, ea, ssize);
> break;
> case VMALLOC_REGION_ID:
> vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
> @@ -1526,7 +1526,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
>
> /* Get VSID */
> ssize = user_segment_size(ea);
> - vsid = get_vsid(mm->context.id, ea, ssize);
> + vsid = get_user_vsid(&mm->context, ea, ssize);
> if (!vsid)
> return;
> /*
> diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
> index 422be81bf69f..b75194dff64c 100644
> --- a/arch/powerpc/mm/mmu_context_book3s64.c
> +++ b/arch/powerpc/mm/mmu_context_book3s64.c
> @@ -179,6 +179,19 @@ void __destroy_context(int context_id)
> }
> EXPORT_SYMBOL_GPL(__destroy_context);
>
> +static void destroy_contexts(mm_context_t *ctx)
> +{
> + int index, context_id;
> +
> + spin_lock(&mmu_context_lock);
> + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
> + context_id = ctx->extended_id[index];
> + if (context_id)
> + ida_remove(&mmu_context_ida, context_id);
> + }
> + spin_unlock(&mmu_context_lock);
> +}
> +
> #ifdef CONFIG_PPC_64K_PAGES
> static void destroy_pagetable_page(struct mm_struct *mm)
> {
> @@ -217,7 +230,7 @@ void destroy_context(struct mm_struct *mm)
> else
> subpage_prot_free(mm);
> destroy_pagetable_page(mm);
> - __destroy_context(mm->context.id);
> + destroy_contexts(&mm->context);
> mm->context.id = MMU_NO_CONTEXT;
> }
>
> diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
> index 469808e77e58..a87b18cf6749 100644
> --- a/arch/powerpc/mm/pgtable-hash64.c
> +++ b/arch/powerpc/mm/pgtable-hash64.c
> @@ -320,7 +320,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
>
> if (!is_kernel_addr(addr)) {
> ssize = user_segment_size(addr);
> - vsid = get_vsid(mm->context.id, addr, ssize);
> + vsid = get_user_vsid(&mm->context, addr, ssize);
> WARN_ON(vsid == 0);
> } else {
> vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index 13cfe413b40d..66577cc66dc9 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -22,6 +22,7 @@
> #include <asm/cacheflush.h>
> #include <asm/smp.h>
> #include <linux/compiler.h>
> +#include <linux/context_tracking.h>
> #include <linux/mm_types.h>
>
> #include <asm/udbg.h>
> @@ -340,3 +341,110 @@ void slb_initialize(void)
>
> asm volatile("isync":::"memory");
> }
> +
> +static void insert_slb_entry(unsigned long vsid, unsigned long ea,
> + int bpsize, int ssize)
> +{
> + unsigned long flags, vsid_data, esid_data;
> + enum slb_index index;
> + int slb_cache_index;
> +
> + /*
> + * We run with IRQs disabled, hence it is safe to access the PACA.
> + */
> + index = get_paca()->stab_rr;
> +
> + /*
> + * Simple round-robin replacement of the SLB, starting at SLB_NUM_BOLTED.
> + */
> + if (index < (mmu_slb_size - 1))
> + index++;
> + else
> + index = SLB_NUM_BOLTED;
> +
> + get_paca()->stab_rr = index;
> +
> + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
> + vsid_data = (vsid << slb_vsid_shift(ssize)) | flags |
> + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
> + esid_data = mk_esid_data(ea, ssize, index);
> +
> + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)
> + : "memory");
> +
> + /*
> + * Now update slb cache entries
> + */
> + slb_cache_index = get_paca()->slb_cache_ptr;
> + if (slb_cache_index < SLB_CACHE_ENTRIES) {
> + /*
> + * We have space in the SLB cache for the optimized switch_slb().
> + * Store the top 36 bits of esid_data, as per the ISA.
> + */
> + get_paca()->slb_cache[slb_cache_index++] = esid_data >> 28;
> + get_paca()->slb_cache_ptr++;
> + } else {
> + /*
> + * Our cache is full and no longer strictly reflects the
> + * active SLB contents. Bump the ptr
> + * so that switch_slb() will ignore the cache.
> + */
> + get_paca()->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
> + }
> +}
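Two small notes on the above for other reviewers: segments are 256MB
(2^28), so the ">> 28" keeps the top 64 - 28 = 36 bits of esid_data, which
is what the "top 36 bits" comment refers to; and the round-robin index
wraps back to SLB_NUM_BOLTED, so the bolted entries are never evicted:

	/*
	 * illustration, assuming mmu_slb_size = 32:
	 *   stab_rr: ... -> 30 -> 31 -> SLB_NUM_BOLTED -> ...
	 * i.e. the bolted slots at the bottom are never replaced.
	 */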
> +
> +static void handle_multi_context_slb_miss(int context_id, unsigned long ea)
> +{
> + struct mm_struct *mm = current->mm;
> + unsigned long vsid;
> + int bpsize;
> +
> + /*
> + * We are always above 1TB, hence use high user segment size.
> + */
> + vsid = get_vsid(context_id, ea, mmu_highuser_ssize);
> + bpsize = get_slice_psize(mm, ea);
> + insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize);
> +}
> +
> +void slb_miss_large_addr(struct pt_regs *regs)
> +{
> + enum ctx_state prev_state = exception_enter();
> + unsigned long ea = regs->dar;
> + int context;
> +
> + if (REGION_ID(ea) != USER_REGION_ID)
> + goto slb_bad_addr;
> +
> + /*
> + * Are we beyond what the page table layout supports?
> + */
> + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
> + goto slb_bad_addr;
> +
> + /* Lower addresses should have been handled by asm code */
> + if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT))
> + goto slb_bad_addr;
> +
> + /*
> + * Consider this a bad access if we take an SLB miss
> + * on an address above the addr limit.
> + */
> + if (ea >= current->mm->context.slb_addr_limit)
> + goto slb_bad_addr;
> +
> + context = get_ea_context(&current->mm->context, ea);
> + if (!context)
> + goto slb_bad_addr;
> +
> + handle_multi_context_slb_miss(context, ea);
> + exception_exit(prev_state);
> + return;
> +
> +slb_bad_addr:
> + if (user_mode(regs))
> + _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
> + else
> + bad_page_fault(regs, ea, SIGSEGV);
> + exception_exit(prev_state);
> +}
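Also worth noting for review: user context ids are allocated starting
above the ids reserved for the kernel, so a zero from get_ea_context()
means no extended context was ever allocated for that 512TB chunk, i.e.
the address was never handed out by mmap() and the SEGV taken here is the
expected outcome.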
> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
> index 2c7c717fd2ea..a83fbd2a4a24 100644
> --- a/arch/powerpc/mm/slb_low.S
> +++ b/arch/powerpc/mm/slb_low.S
> @@ -75,10 +75,15 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
> */
> _GLOBAL(slb_allocate)
> /*
> - * check for bad kernel/user address
> - * (ea & ~REGION_MASK) >= PGTABLE_RANGE
> + * Check if the address falls within the range of the first context, or
> + * if we may need to handle multiple contexts. For the first context we
> + * allocate the SLB entry via the fast path below. For large addresses we
> + * branch out to C code and see if additional contexts have been
> + * allocated.
> + * The test here is:
> + * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT)
> */
> - rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4)
> + rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4)
> bne- 8f
>
> srdi r9,r3,60 /* get region */
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 09ac1a709d0c..9cd87d11fe4e 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -648,6 +648,15 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
> slice_print_mask(" mask", &potential_mask);
>
> convert:
> + /*
> + * Try to allocate the context before we do slice convert
> + * so that we handle the context allocation failure gracefully.
> + */
> + if (need_extra_context(mm, newaddr)) {
> + if (alloc_extended_context(mm, newaddr) < 0)
> + return -ENOMEM;
> + }
> +
> slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
> if (compat_maskp && !fixed)
> slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
> @@ -658,10 +667,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
> if (psize > MMU_PAGE_BASE)
> on_each_cpu(slice_flush_segments, mm, 1);
> }
> + return newaddr;
>
> return_addr:
> + if (need_extra_context(mm, newaddr)) {
> + if (alloc_extended_context(mm, newaddr) < 0)
> + return -ENOMEM;
> + }
> return newaddr;
> -
> }
> EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
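For completeness, the way userspace reaches this path: addresses above
DEFAULT_MAP_WINDOW are only handed out when mmap() is called with a hint
above it, so something like the snippet below (my example, not part of the
patch) is what ends up exercising alloc_extended_context() and, later, the
new SLB miss handler:

	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		/* hint above 512TB (1UL << 49); 1UL << 50 is 1PB */
		void *hint = (void *)(1UL << 50);
		void *p = mmap(hint, 1UL << 16, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			perror("mmap");
		else
			printf("mapped at %p\n", p);
		return 0;
	}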
>
> diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
> index 9b23f12e863c..87d71dd25441 100644
> --- a/arch/powerpc/mm/tlb_hash64.c
> +++ b/arch/powerpc/mm/tlb_hash64.c
> @@ -89,7 +89,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
> /* Build full vaddr */
> if (!is_kernel_addr(addr)) {
> ssize = user_segment_size(addr);
> - vsid = get_vsid(mm->context.id, addr, ssize);
> + vsid = get_user_vsid(&mm->context, addr, ssize);
> } else {
> vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
> ssize = mmu_kernel_ssize;
>