[PATCH v2 2/3] powerpc/powernv/npu: Use size-based ATSD invalidates
Alistair Popple
alistair at popple.id.au
Thu Oct 4 15:20:39 AEST 2018
Reviewed-by: Alistair Popple <alistair at popple.id.au>
On Wednesday, 3 October 2018 11:51:33 AM AEST Mark Hairgrove wrote:
> Prior to this change only two types of ATSDs were issued to the NPU:
> invalidates targeting a single page and invalidates targeting the whole
> address space. The crossover point happened at the configurable
> atsd_threshold which defaulted to 2M. Invalidates that size or smaller
> would issue per-page invalidates for the whole range.
>
> The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all.
> These invalidates target addresses aligned to their size. 2M is a common
> invalidation size for GPU-enabled applications because that is a GPU
> page size, so reducing the number of invalidates by 32x in that case is a
> clear improvement.
>
> ATSD latency is high in general so now we always issue a single invalidate
> rather than multiple. This will over-invalidate in some cases, but for any
> invalidation size over 2M it matches or improves the prior behavior.
> There's also an improvement for single-page invalidates since the prior
> version issued two invalidates for that case instead of one.
>
> With this change all issued ATSDs now perform a flush, so the flush
> parameter has been removed from all the helpers.
>
> To show the benefit here are some performance numbers from a
> microbenchmark which creates a 1G allocation then uses mprotect with
> PROT_NONE to trigger invalidates in strides across the allocation.
>
> One NPU (1 GPU):
>
> mprotect rate (GB/s)
> Stride Before After Speedup
> 64K 5.3 5.6 5%
> 1M 39.3 57.4 46%
> 2M 49.7 82.6 66%
> 4M 286.6 285.7 0%
>
> Two NPUs (6 GPUs):
>
> mprotect rate (GB/s)
> Stride Before After Speedup
> 64K 6.5 7.4 13%
> 1M 33.4 67.9 103%
> 2M 38.7 93.1 141%
> 4M 356.7 354.6 -1%
>
> Anything over 2M is roughly the same as before since both cases issue a
> single ATSD.
>
> Signed-off-by: Mark Hairgrove <mhairgrove at nvidia.com>
> ---
> arch/powerpc/platforms/powernv/npu-dma.c | 103 ++++++++++++++++--------------
> 1 files changed, 55 insertions(+), 48 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index c8f438a..e4c0fab 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -18,6 +18,7 @@
> #include <linux/memblock.h>
> #include <linux/iommu.h>
> #include <linux/debugfs.h>
> +#include <linux/sizes.h>
>
> #include <asm/debugfs.h>
> #include <asm/tlb.h>
> @@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
> #define XTS_ATSD_AVA 1
> #define XTS_ATSD_STAT 2
>
> -static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
> - bool flush)
> +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
> {
> unsigned long launch = 0;
>
> @@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
> /* PID */
> launch |= pid << PPC_BITLSHIFT(38);
>
> - /* No flush */
> - launch |= !flush << PPC_BITLSHIFT(39);
> + /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
>
> return launch;
> }
> @@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg
> }
>
> static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> - unsigned long pid, bool flush)
> + unsigned long pid)
> {
> - unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
> + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
>
> /* Invalidating the entire process doesn't use a va */
> mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
> }
>
> -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> - unsigned long va, unsigned long pid, bool flush)
> +static void mmio_invalidate_range(struct mmio_atsd_reg
> + mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
> + unsigned long start, unsigned long psize)
> {
> - unsigned long launch;
> -
> - launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
> + unsigned long launch = get_atsd_launch_val(pid, psize);
>
> /* Write all VAs first */
> - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
> + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
>
> /* Issue one barrier for all address writes */
> eieio();
> @@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
> }
>
> /*
> - * Invalidate either a single address or an entire PID depending on
> - * the value of va.
> + * Invalidate a virtual address range
> */
> -static void mmio_invalidate(struct npu_context *npu_context, int va,
> - unsigned long address, bool flush)
> +static void mmio_invalidate(struct npu_context *npu_context,
> + unsigned long start, unsigned long size)
> {
> struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
> unsigned long pid = npu_context->mm->context.id;
> + unsigned long atsd_start = 0;
> + unsigned long end = start + size - 1;
> + int atsd_psize = MMU_PAGE_COUNT;
> +
> + /*
> + * Convert the input range into one of the supported sizes. If the range
> + * doesn't fit, use the next larger supported size. Invalidation latency
> + * is high, so over-invalidation is preferred to issuing multiple
> + * invalidates.
> + *
> + * A 4K page size isn't supported by NPU/GPU ATS, so that case is
> + * ignored.
> + */
> + if (size == SZ_64K) {
> + atsd_start = start;
> + atsd_psize = MMU_PAGE_64K;
> + } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
> + atsd_start = ALIGN_DOWN(start, SZ_2M);
> + atsd_psize = MMU_PAGE_2M;
> + } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
> + atsd_start = ALIGN_DOWN(start, SZ_1G);
> + atsd_psize = MMU_PAGE_1G;
> + }
>
> if (npu_context->nmmu_flush)
> /*
> @@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
> * an invalidate.
> */
> acquire_atsd_reg(npu_context, mmio_atsd_reg);
> - if (va)
> - mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
> +
> + if (atsd_psize == MMU_PAGE_COUNT)
> + mmio_invalidate_pid(mmio_atsd_reg, pid);
> else
> - mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
> + mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
> + atsd_psize);
>
> mmio_invalidate_wait(mmio_atsd_reg);
> - if (flush) {
> - /*
> - * The GPU requires two flush ATSDs to ensure all entries have
> - * been flushed. We use PID 0 as it will never be used for a
> - * process on the GPU.
> - */
> - mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> - mmio_invalidate_wait(mmio_atsd_reg);
> - mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> - mmio_invalidate_wait(mmio_atsd_reg);
> - }
> +
> + /*
> + * The GPU requires two flush ATSDs to ensure all entries have been
> + * flushed. We use PID 0 as it will never be used for a process on the
> + * GPU.
> + */
> + mmio_invalidate_pid(mmio_atsd_reg, 0);
> + mmio_invalidate_wait(mmio_atsd_reg);
> + mmio_invalidate_pid(mmio_atsd_reg, 0);
> + mmio_invalidate_wait(mmio_atsd_reg);
> +
> release_atsd_reg(mmio_atsd_reg);
> }
>
> @@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
> * There should be no more translation requests for this PID, but we
> * need to ensure any entries for it are removed from the TLB.
> */
> - mmio_invalidate(npu_context, 0, 0, true);
> + mmio_invalidate(npu_context, 0, ~0UL);
> }
>
> static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
> @@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
> pte_t pte)
> {
> struct npu_context *npu_context = mn_to_npu_context(mn);
> -
> - mmio_invalidate(npu_context, 1, address, true);
> + mmio_invalidate(npu_context, address, PAGE_SIZE);
> }
>
> static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
> @@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
> unsigned long start, unsigned long end)
> {
> struct npu_context *npu_context = mn_to_npu_context(mn);
> - unsigned long address;
> -
> - if (end - start > atsd_threshold) {
> - /*
> - * Just invalidate the entire PID if the address range is too
> - * large.
> - */
> - mmio_invalidate(npu_context, 0, 0, true);
> - } else {
> - for (address = start; address < end; address += PAGE_SIZE)
> - mmio_invalidate(npu_context, 1, address, false);
> -
> - /* Do the flush only on the final addess == end */
> - mmio_invalidate(npu_context, 1, address, true);
> - }
> + mmio_invalidate(npu_context, start, end - start);
> }
>
> static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
>
More information about the Linuxppc-dev
mailing list