[PATCH 6/6] powerpc/64s/radix: introduce options to disable use of the tlbie instruction

Alistair Popple alistair at popple.id.au
Tue Sep 3 10:32:56 AEST 2019


Nick,

On Tuesday, 3 September 2019 1:29:31 AM AEST Nicholas Piggin wrote:
> Introduce two options to control the use of the tlbie instruction. A
> boot time option which completely disables the kernel using the
> instruction, this is currently incompatible with HASH MMU, KVM, and
> coherent accelerators.

Some accelerators (eg. cxl, ocxl, npu) call mm_context_add_copro() to force 
global TLB invalidations:

static inline void mm_context_add_copro(struct mm_struct *mm)
{
        /*
         * If any copro is in use, increment the active CPU count
         * in order to force TLB invalidations to be global as to
         * propagate to the Nest MMU.
         */
        if (atomic_inc_return(&mm->context.copros) == 1)
                inc_mm_active_cpus(mm);
}

Admittedly I haven't dug into all the details of this patch, but it sounds like 
it might break the above if TLBIE is disabled. Do you think we should add a 
WARN_ON if mm_context_add_copro() is called with TLBIE disabled? Or perhaps 
even force TLBIE to be re-enabled if it is called while TLBIE is disabled?

- Alistair

> And a debugfs option can be switched at runtime and avoids using tlbie
> for invalidating CPU TLBs for normal process and kernel address
> mappings. Coherent accelerators are still managed with tlbie, as will
> KVM partition scope translations.
> 
> Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
> basic implementation which does not attempt to make any optimisation
> beyond the tlbie implementation.
> 
> This is useful for performance testing among other things. For example
> in certain situations on large systems, using IPIs may be faster than
> tlbie as they can be directed rather than broadcast. Later we may also
> take advantage of the IPIs to do more interesting things such as trim
> the mm cpumask more aggressively.
> 
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |   4 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
>  arch/powerpc/kvm/book3s_hv.c                  |   6 +
>  arch/powerpc/mm/book3s64/pgtable.c            |  47 +++++
>  arch/powerpc/mm/book3s64/radix_tlb.c          | 190 ++++++++++++++++--
>  drivers/misc/cxl/main.c                       |   4 +
>  drivers/misc/ocxl/main.c                      |   4 +
>  7 files changed, 246 insertions(+), 18 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/
admin-guide/kernel-parameters.txt
> index d3cbb3ae62b6..65ae16549aa3 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -860,6 +860,10 @@
>  	disable_radix	[PPC]
>  			Disable RADIX MMU mode on POWER9
>  
> +	disable_tlbie	[PPC]
> +			Disable TLBIE instruction. Currently does not work
> +			with KVM, with HASH MMU, or with coherent accelerators.
> +
>  	disable_cpu_apicid= [X86,APIC,SMP]
>  			Format: <int>
>  			The number of initial APIC ID for the
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/
include/asm/book3s/64/tlbflush.h
> index ebf572ea621e..7aa8195b6cff 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather 
*tlb, unsigned long addre
>  
>  	radix__flush_tlb_pwc(tlb, address);
>  }
> +
> +extern bool tlbie_capable;
> +extern bool tlbie_enabled;
> +
> +static inline bool cputlb_use_tlbie(void)
> +{
> +	return tlbie_enabled;
> +}
> +
>  #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index cde3f5a4b3e4..3cdaa2a09a19 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void)
>  static int kvmppc_book3s_init_hv(void)
>  {
>  	int r;
> +
> +	if (!tlbie_capable) {
> +		pr_err("KVM-HV: Host does not support TLBIE\n");
> +		return -ENODEV;
> +	}
> +
>  	/*
>  	 * FIXME!! Do we need to check on all cpus ?
>  	 */
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/
pgtable.c
> index 351eb78eed55..75483b40fcb1 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -8,6 +8,7 @@
>  #include <linux/memblock.h>
>  #include <misc/cxl-base.h>
>  
> +#include <asm/debugfs.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlb.h>
>  #include <asm/trace.h>
> @@ -469,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock 
*new_pmd_ptl,
>  
>  	return true;
>  }
> +
> +/*
> + * Does the CPU support tlbie?
> + */
> +bool tlbie_capable __read_mostly = true;
> +EXPORT_SYMBOL(tlbie_capable);
> +
> +/*
> + * Should tlbie be used for management of CPU TLBs, for kernel and process
> + * address spaces? tlbie may still be used for nMMU accelerators, and for 
KVM
> + * guest address spaces.
> + */
> +bool tlbie_enabled __read_mostly = true;
> +
> +static int __init setup_disable_tlbie(char *str)
> +{
> +	if (!radix_enabled()) {
> +		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
> +		return 1;
> +	}
> +
> +	tlbie_capable = false;
> +	tlbie_enabled = false;
> +
> +        return 1;
> +}
> +__setup("disable_tlbie", setup_disable_tlbie);
> +
> +static int __init pgtable_debugfs_setup(void)
> +{
> +	if (!tlbie_capable)
> +		return 0;
> +
> +	/*
> +	 * There is no locking vs tlb flushing when changing this value.
> +	 * The tlb flushers will see one value or another, and use either
> +	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
> +	 * invalidated as expected.
> +	 */
> +	debugfs_create_bool("tlbie_enabled", 0600,
> +			powerpc_debugfs_root,
> +			&tlbie_enabled);
> +
> +	return 0;
> +}
> +arch_initcall(pgtable_debugfs_setup);
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/
radix_tlb.c
> index f9cf8ae59831..631be42abd33 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -270,6 +270,39 @@ static inline void _tlbie_pid(unsigned long pid, 
unsigned long ric)
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +struct tlbiel_pid {
> +	unsigned long pid;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_pid(void *info)
> +{
> +	struct tlbiel_pid *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
> +				unsigned long pid, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_pid t = { .pid = pid, .ric = ric };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
> +	/*
> +	 * Always want the CPU translations to be invalidated with tlbiel in
> +	 * these paths, so while coprocessors must use tlbie, we can not
> +	 * optimise away the tlbiel component.
> +	 */
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +}
> +
>  static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
>  {
>  	asm volatile("ptesync": : :"memory");
> @@ -370,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, 
unsigned long pid,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +struct tlbiel_va {
> +	unsigned long pid;
> +	unsigned long va;
> +	unsigned long psize;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_va(void *info)
> +{
> +	struct tlbiel_va *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_va_multicast(struct mm_struct *mm,
> +				unsigned long va, unsigned long pid,
> +				unsigned long psize, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
> +	on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
> +}
> +
> +struct tlbiel_va_range {
> +	unsigned long pid;
> +	unsigned long start;
> +	unsigned long end;
> +	unsigned long page_size;
> +	unsigned long psize;
> +	bool also_pwc;
> +};
> +
> +static void do_tlbiel_va_range(void *info)
> +{
> +	struct tlbiel_va_range *t = info;
> +
> +	_tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
> +				    t->psize, t->also_pwc);
> +}
> +
>  static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long 
lpid,
>  			      unsigned long psize, unsigned long ric)
>  {
> @@ -393,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, 
unsigned long end,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
> +				unsigned long start, unsigned long end,
> +				unsigned long pid, unsigned long page_size,
> +				unsigned long psize, bool also_pwc)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va_range t = { .start = start, .end = end,
> +				.pid = pid, .page_size = page_size,
> +				.psize = psize, .also_pwc = also_pwc };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +}
> +
>  /*
>   * Base TLB flushing operations:
>   *
> @@ -530,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
>  			goto local;
>  		}
>  
> -		if (mm_needs_flush_escalation(mm))
> -			_tlbie_pid(pid, RIC_FLUSH_ALL);
> -		else
> -			_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie()) {
> +			if (mm_needs_flush_escalation(mm))
> +				_tlbie_pid(pid, RIC_FLUSH_ALL);
> +			else
> +				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		} else {
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +		}
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_TLB);
> @@ -559,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool 
fullmm)
>  				goto local;
>  			}
>  		}
> -		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		if (cputlb_use_tlbie())
> +			_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		else
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_ALL);
> @@ -594,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, 
unsigned long vmaddr,
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		else
> +			_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
>  	} else {
>  local:
>  		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> @@ -616,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>  #define radix__flush_all_mm radix__local_flush_all_mm
>  #endif /* CONFIG_SMP */
>  
> +static void do_tlbiel_kernel(void *info)
> +{
> +	_tlbiel_pid(0, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_kernel_broadcast(void)
> +{
> +	on_each_cpu(do_tlbiel_kernel, NULL, 1);
> +	if (tlbie_capable) {
> +		/*
> +		 * Coherent accelerators don't refcount kernel memory mappings,
> +		 * so have to always issue a tlbie for them. This is quite a
> +		 * slow path anyway.
> +		 */
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	}
> +}
> +
>  /*
>   * If kernel TLBIs ever become local rather than global, then
>   * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
> @@ -623,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>   */
>  void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> -	_tlbie_pid(0, RIC_FLUSH_ALL);
> +	if (cputlb_use_tlbie())
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	else
> +		_tlbiel_kernel_broadcast();
>  }
>  EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>  
> @@ -679,10 +805,14 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				_tlbie_pid(pid, RIC_FLUSH_ALL);
> -			else
> -				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					_tlbie_pid(pid, RIC_FLUSH_ALL);
> +				else
> +					_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +			}
>  		}
>  	} else {
>  		bool hflush = flush_all_sizes;
> @@ -707,8 +837,8 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>  				gflush = false;
>  		}
>  
> -		asm volatile("ptesync": : :"memory");
>  		if (local) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
>  			if (hflush)
>  				__tlbiel_va_range(hstart, hend, pid,
> @@ -717,7 +847,8 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>  				__tlbiel_va_range(gstart, gend, pid,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			asm volatile("ptesync": : :"memory");
> -		} else {
> +		} else if (cputlb_use_tlbie()) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
>  			if (hflush)
>  				__tlbie_va_range(hstart, hend, pid,
> @@ -727,6 +858,15 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			fixup_tlbie();
>  			asm volatile("eieio; tlbsync; ptesync": : :"memory");
> +		} else {
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, mmu_virtual_psize, false);
> +			if (hflush)
> +				_tlbiel_va_range_multicast(mm,
> +					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
> +			if (gflush)
> +				_tlbiel_va_range_multicast(mm,
> +					gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
>  		}
>  	}
>  	preempt_enable();
> @@ -903,16 +1043,26 @@ static __always_inline void 
__radix__flush_tlb_range_psize(struct mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				also_pwc = true;
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					also_pwc = true;
> +
> +				_tlbie_pid(pid,
> +					also_pwc ?  RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid,
> +					also_pwc ?  RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			}
>  
> -			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		}
>  	} else {
>  		if (local)
>  			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
> -		else
> +		else if (cputlb_use_tlbie())
>  			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, psize, also_pwc);
>  	}
>  	preempt_enable();
>  }
> @@ -954,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct 
*mm, unsigned long addr)
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, 
true);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
>  	} else {
>  local:
>  		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
> index 482a2c1b340a..43b312d06e3e 100644
> --- a/drivers/misc/cxl/main.c
> +++ b/drivers/misc/cxl/main.c
> @@ -18,6 +18,7 @@
>  #include <linux/sched/task.h>
>  
>  #include <asm/cputable.h>
> +#include <asm/mmu.h>
>  #include <misc/cxl-base.h>
>  
>  #include "cxl.h"
> @@ -315,6 +316,9 @@ static int __init init_cxl(void)
>  {
>  	int rc = 0;
>  
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	if ((rc = cxl_file_init()))
>  		return rc;
>  
> diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
> index 7210d9e059be..ef73cf35dda2 100644
> --- a/drivers/misc/ocxl/main.c
> +++ b/drivers/misc/ocxl/main.c
> @@ -2,12 +2,16 @@
>  // Copyright 2017 IBM Corp.
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <asm/mmu.h>
>  #include "ocxl_internal.h"
>  
>  static int __init init_ocxl(void)
>  {
>  	int rc = 0;
>  
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	rc = ocxl_file_init();
>  	if (rc)
>  		return rc;
> 






More information about the Linuxppc-dev mailing list