[PATCH V2 39/68] powerpc/mm/radix: Add tlbflush routines
Balbir Singh
bsingharora at gmail.com
Fri Apr 22 17:20:55 AEST 2016
On 09/04/16 16:13, Aneesh Kumar K.V wrote:
> Core kernel don't track the page size of the va range that we are
> invalidating. Hence we end up flushing tlb for the entire mm here.
> Later patches will improve this.
>
> We also don't flush page walk cache separetly instead use RIC=2 when
> flushing tlb, because we do a mmu gather flush after freeing page table.
>
> MMU_NO_CONTEXT is updated for hash.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 +
> arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 13 +-
> .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 33 +++
> arch/powerpc/include/asm/book3s/64/tlbflush.h | 20 ++
> arch/powerpc/include/asm/tlbflush.h | 1 +
> arch/powerpc/mm/Makefile | 2 +-
> arch/powerpc/mm/tlb-radix.c | 243 +++++++++++++++++++++
> 7 files changed, 308 insertions(+), 5 deletions(-)
> create mode 100644 arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> create mode 100644 arch/powerpc/mm/tlb-radix.c
>
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 7da61b85406b..290157e8d5b2 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -119,6 +119,7 @@
> #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */
> #define POWER8_TLB_SETS 512 /* # sets in POWER8 TLB */
> #define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */
> +#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
>
> #ifndef __ASSEMBLY__
>
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> index ddce8477fe0c..e90310d1a519 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> @@ -1,8 +1,6 @@
> #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
> #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
>
> -#define MMU_NO_CONTEXT 0
> -
> /*
> * TLB flushing for 64-bit hash-MMU CPUs
> */
> @@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
>
> static inline void arch_enter_lazy_mmu_mode(void)
> {
> - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
> + struct ppc64_tlb_batch *batch;
>
> + if (radix_enabled())
> + return;
> + batch = this_cpu_ptr(&ppc64_tlb_batch);
> batch->active = 1;
> }
>
> static inline void arch_leave_lazy_mmu_mode(void)
> {
> - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
> + struct ppc64_tlb_batch *batch;
> +
> + if (radix_enabled())
> + return;
> + batch = this_cpu_ptr(&ppc64_tlb_batch);
>
Are we better off doing
#ifndef CONFIG_RADIX_MMU
static inline arch_enter_lazy_mmu(...)
{
Actual code for HASH PTEs
}
#else
static inline arch_enter_lazy_mmu(...)
{
}
#endif
Unless you need a runtime switch -- which would mean we need both HPTE/RADIX to co-exist?
> if (batch->index)
> __flush_tlb_pending(batch);
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> new file mode 100644
> index 000000000000..584ffa0a331f
> --- /dev/null
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -0,0 +1,33 @@
> +#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
> +#define _ASM_POWERPC_TLBFLUSH_RADIX_H
> +
> +struct vm_area_struct;
> +struct mm_struct;
> +struct mmu_gather;
> +
> +static inline int mmu_get_ap(int psize)
> +{
> + return mmu_psize_defs[psize].ap;
> +}
> +
Why the abstraction? The previous patches happily used mmu_psize_defs[psize].YYY directly.
> +extern void flush_rtlb_range(struct vm_area_struct *vma, unsigned long start,
> + unsigned long end);
> +extern void flush_rtlb_kernel_range(unsigned long start, unsigned long end);
> +
> +extern void local_flush_rtlb_mm(struct mm_struct *mm);
> +extern void local_flush_rtlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
> +extern void __local_flush_rtlb_page(struct mm_struct *mm, unsigned long vmaddr,
> + unsigned long ap, int nid);
> +extern void rtlb_flush(struct mmu_gather *tlb);
> +#ifdef CONFIG_SMP
> +extern void flush_rtlb_mm(struct mm_struct *mm);
> +extern void flush_rtlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
> +extern void __flush_rtlb_page(struct mm_struct *mm, unsigned long vmaddr,
> + unsigned long ap, int nid);
> +#else
> +#define flush_rtlb_mm(mm) local_flush_rtlb_mm(mm)
> +#define flush_rtlb_page(vma,addr) local_flush_rtlb_page(vma,addr)
> +#define __flush_rtlb_page(mm,addr,p,i) __local_flush_rtlb_page(mm,addr,p,i)
> +#endif
> +
> +#endif
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 37d7f289ad42..66b7bc371491 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -1,51 +1,71 @@
> #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
> #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
>
> +#define MMU_NO_CONTEXT ~0UL
> +
> +
> #include <asm/book3s/64/tlbflush-hash.h>
> +#include <asm/book3s/64/tlbflush-radix.h>
>
> static inline void flush_tlb_range(struct vm_area_struct *vma,
> unsigned long start, unsigned long end)
> {
> + if (radix_enabled())
> + return flush_rtlb_range(vma, start, end);
> return flush_hltlb_range(vma, start, end);
> }
>
> static inline void flush_tlb_kernel_range(unsigned long start,
> unsigned long end)
> {
> + if (radix_enabled())
> + return flush_rtlb_kernel_range(start, end);
> return flush_hltlb_kernel_range(start, end);
> }
>
> static inline void local_flush_tlb_mm(struct mm_struct *mm)
> {
> + if (radix_enabled())
> + return local_flush_rtlb_mm(mm);
> return local_flush_hltlb_mm(mm);
> }
>
> static inline void local_flush_tlb_page(struct vm_area_struct *vma,
> unsigned long vmaddr)
> {
> + if (radix_enabled())
> + return local_flush_rtlb_page(vma, vmaddr);
> return local_flush_hltlb_page(vma, vmaddr);
> }
>
> static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
> unsigned long vmaddr)
> {
> + if (radix_enabled())
> + return flush_rtlb_page(vma, vmaddr);
> return flush_hltlb_page_nohash(vma, vmaddr);
> }
>
> static inline void tlb_flush(struct mmu_gather *tlb)
> {
> + if (radix_enabled())
> + return rtlb_flush(tlb);
> return hltlb_flush(tlb);
> }
>
> #ifdef CONFIG_SMP
> static inline void flush_tlb_mm(struct mm_struct *mm)
> {
> + if (radix_enabled())
> + return flush_rtlb_mm(mm);
> return flush_hltlb_mm(mm);
> }
>
> static inline void flush_tlb_page(struct vm_area_struct *vma,
> unsigned long vmaddr)
> {
> + if (radix_enabled())
> + return flush_rtlb_page(vma, vmaddr);
> return flush_hltlb_page(vma, vmaddr);
> }
> #else
> diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
> index 2fc4331c5bc5..1b38eea28e5a 100644
> --- a/arch/powerpc/include/asm/tlbflush.h
> +++ b/arch/powerpc/include/asm/tlbflush.h
> @@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
>
> #elif defined(CONFIG_PPC_STD_MMU_32)
>
> +#define MMU_NO_CONTEXT (0)
> /*
> * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
> */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index 9589236028f4..48aa11ae6a6b 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
> hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
> obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
> obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o
> -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o
> +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o
> obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
> obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o
> ifeq ($(CONFIG_PPC_STD_MMU_64),y)
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> new file mode 100644
> index 000000000000..9129c0d6322c
> --- /dev/null
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -0,0 +1,243 @@
> +/*
> + * TLB flush routines for radix kernels.
> + *
> + * Copyright (C) 2015 Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + *
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/hugetlb.h>
> +#include <linux/memblock.h>
> +
> +#include <asm/tlb.h>
> +#include <asm/tlbflush.h>
> +
> +static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
> +
> +static inline void __tlbiel_pid(unsigned long pid, int set)
> +{
> + unsigned long rb,rs,ric,prs,r;
> +
> + rb = PPC_BIT(53); /* IS = 1 */
> + rb |= set << PPC_BITLSHIFT(51);
Should we mask the set? set & cpu_to_be64(0x0000000000fff000)?
> + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
> + prs = 1; /* process scoped */
> + r = 1; /* raidx format */
> + ric = 2; /* invalidate all the caches */
> +
> + asm volatile("ptesync": : :"memory");
> + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
Can we have a usable name for the opcode? I know compilers might not support it yet, but
having a
#define READABLE_OPCODE 0x7c000224
would make this easier to read.
BTW, does this opcode work for both endians?
> + "(%2 << 17) | (%3 << 18) | (%4 << 21)"
> + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
> + asm volatile("ptesync": : :"memory");
> +}
> +
> +/*
> + * We use 128 set in radix mode and 256 set in hpt mode.
Why?
> + */
> +static inline void _tlbiel_pid(unsigned long pid)
> +{
> + int set;
> +
> + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
> + __tlbiel_pid(pid, set);
> + }
> + return;
> +}
> +
> +static inline void _tlbie_pid(unsigned long pid)
> +{
> + unsigned long rb,rs,ric,prs,r;
> +
> + rb = PPC_BIT(53); /* IS = 1 */
> + rs = pid << PPC_BITLSHIFT(31);
> + prs = 1; /* process scoped */
> + r = 1; /* raidx format */
> + ric = 2; /* invalidate all the caches */
> +
> + asm volatile("ptesync": : :"memory");
> + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
> + "(%2 << 17) | (%3 << 18) | (%4 << 21)"
> + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
Same comments as above
> + asm volatile("eieio; tlbsync; ptesync": : :"memory");
> +}
> +
> +static inline void _tlbiel_va(unsigned long va, unsigned long pid,
> + unsigned long ap)
> +{
> + unsigned long rb,rs,ric,prs,r;
> +
> + rb = va & ~(PPC_BITMASK(52, 63));
> + rb |= ap << PPC_BITLSHIFT(58);
> + rs = pid << PPC_BITLSHIFT(31);
> + prs = 1; /* process scoped */
> + r = 1; /* raidx format */
^^ radix
> + ric = 0; /* no cluster flush yet */
> +
Should we explicitly set IS = 0 here?
> + asm volatile("ptesync": : :"memory");
> + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
> + "(%2 << 17) | (%3 << 18) | (%4 << 21)"
> + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
Ditto
> + asm volatile("ptesync": : :"memory");
> +}
> +
> +static inline void _tlbie_va(unsigned long va, unsigned long pid,
> + unsigned long ap)
> +{
> + unsigned long rb,rs,ric,prs,r;
> +
> + rb = va & ~(PPC_BITMASK(52, 63));
> + rb |= ap << PPC_BITLSHIFT(58);
> + rs = pid << PPC_BITLSHIFT(31);
> + prs = 1; /* process scoped */
> + r = 1; /* raidx format */
^^ radix
> + ric = 0; /* no cluster flush yet */
> +
> + asm volatile("ptesync": : :"memory");
> + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
> + "(%2 << 17) | (%3 << 18) | (%4 << 21)"
> + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
Same as above
> + asm volatile("eieio; tlbsync; ptesync": : :"memory");
> +}
> +
> +/*
> + * Base TLB flushing operations:
> + *
> + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
> + * - flush_tlb_page(vma, vmaddr) flushes one page
> + * - flush_tlb_range(vma, start, end) flushes a range of pages
> + * - flush_tlb_kernel_range(start, end) flushes kernel pages
> + *
> + * - local_* variants of page and mm only apply to the current
> + * processor
> + */
> +void local_flush_rtlb_mm(struct mm_struct *mm)
> +{
> + unsigned int pid;
> +
> + preempt_disable();
> + pid = mm->context.id;
> + if (pid != MMU_NO_CONTEXT)
> + _tlbiel_pid(pid);
> + preempt_enable();
> +}
> +EXPORT_SYMBOL(local_flush_rtlb_mm);
> +
> +void __local_flush_rtlb_page(struct mm_struct *mm, unsigned long vmaddr,
> + unsigned long ap, int nid)
> +{
> + unsigned int pid;
> +
> + preempt_disable();
> + pid = mm ? mm->context.id : 0;
> + if (pid != MMU_NO_CONTEXT)
> + _tlbiel_va(vmaddr, pid, ap);
> + preempt_enable();
> +}
> +
> +void local_flush_rtlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> +{
> + __local_flush_rtlb_page(vma ? vma->vm_mm : NULL, vmaddr,
> + mmu_get_ap(mmu_virtual_psize), 0);
> +}
> +EXPORT_SYMBOL(local_flush_rtlb_page);
> +
> +#ifdef CONFIG_SMP
> +static int mm_is_core_local(struct mm_struct *mm)
> +{
> + return cpumask_subset(mm_cpumask(mm),
> + topology_sibling_cpumask(smp_processor_id()));
A comment should say that this must be called with preemption disabled (preempt_disable())
> +}
> +
> +void flush_rtlb_mm(struct mm_struct *mm)
> +{
> + unsigned int pid;
> +
> + preempt_disable();
> + pid = mm->context.id;
> + if (unlikely(pid == MMU_NO_CONTEXT))
> + goto no_context;
Why did we flush from this context? Is this common?
> +
> + if (!mm_is_core_local(mm)) {
> + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> +
I think any radix CPU will support this feature -- no?
> + if (lock_tlbie)
> + raw_spin_lock(&native_tlbie_lock);
> + _tlbie_pid(pid);
> + if (lock_tlbie)
> + raw_spin_unlock(&native_tlbie_lock);
> + } else
> + _tlbiel_pid(pid);
> +no_context:
> + preempt_enable();
> +}
> +EXPORT_SYMBOL(flush_rtlb_mm);
> +
> +void __flush_rtlb_page(struct mm_struct *mm, unsigned long vmaddr,
> + unsigned long ap, int nid)
> +{
> + unsigned int pid;
> +
> + preempt_disable();
> + pid = mm ? mm->context.id : 0;
> + if (unlikely(pid == MMU_NO_CONTEXT))
> + goto bail;
The label is "bail" here and "no_context" above -- can we use one name consistently?
> + if (!mm_is_core_local(mm)) {
> + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> +
> + if (lock_tlbie)
> + raw_spin_lock(&native_tlbie_lock);
> + _tlbie_va(vmaddr, pid, ap);
> + if (lock_tlbie)
> + raw_spin_unlock(&native_tlbie_lock);
> + } else
> + _tlbiel_va(vmaddr, pid, ap);
> +bail:
> + preempt_enable();
> +}
> +
> +void flush_rtlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> +{
> + __flush_rtlb_page(vma ? vma->vm_mm : NULL, vmaddr,
> + mmu_get_ap(mmu_virtual_psize), 0);
> +}
> +EXPORT_SYMBOL(flush_rtlb_page);
> +
> +#endif /* CONFIG_SMP */
> +
> +void flush_rtlb_kernel_range(unsigned long start, unsigned long end)
> +{
> + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> +
> + if (lock_tlbie)
> + raw_spin_lock(&native_tlbie_lock);
> + _tlbie_pid(0);
Oh! so PID can be 0 for vmalloc'ed regions?
> + if (lock_tlbie)
> + raw_spin_unlock(&native_tlbie_lock);
> +}
> +EXPORT_SYMBOL(flush_rtlb_kernel_range);
> +
> +/*
> + * Currently, for range flushing, we just do a full mm flush. Because
> + * we use this in code path where we don' track the page size.
> + */
> +void flush_rtlb_range(struct vm_area_struct *vma, unsigned long start,
> + unsigned long end)
> +
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + flush_rtlb_mm(mm);
> +}
> +EXPORT_SYMBOL(flush_rtlb_range);
> +
> +
> +void rtlb_flush(struct mmu_gather *tlb)
> +{
> + struct mm_struct *mm = tlb->mm;
> + flush_rtlb_mm(mm);
> +}
>
More information about the Linuxppc-dev
mailing list