[PATCH v2 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates

Alistair Popple alistair at popple.id.au
Thu Oct 4 15:20:17 AEST 2018


Reviewed-by: Alistair Popple <alistair at popple.id.au>

On Wednesday, 3 October 2018 11:51:32 AM AEST Mark Hairgrove wrote:
> There are two types of ATSDs issued to the NPU: invalidates targeting a
> specific virtual address and invalidates targeting the whole address
> space. In both cases prior to this change, the sequence was:
> 
>     for each NPU
>         - Write the target address to the XTS_ATSD_AVA register
>         - EIEIO
>         - Write the launch value to issue the ATSD
> 
> First, a target address is not required when invalidating the whole
> address space, so that write and the EIEIO have been removed. The AP
> (size) field in the launch is not needed either.
> 
> Second, for per-address invalidates the above sequence is inefficient in
> the common case of multiple NPUs because an EIEIO is issued per NPU. This
> unnecessarily forces the launches of later ATSDs to be ordered with the
> launches of earlier ones. The new sequence only issues a single EIEIO:
> 
>     for each NPU
>         - Write the target address to the XTS_ATSD_AVA register
>     EIEIO
>     for each NPU
>         - Write the launch value to issue the ATSD
> 
> Performance results were gathered using a microbenchmark which creates a
> 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in
> strides across the allocation.
> 
> With only a single NPU active (one GPU) the difference is in the noise for
> both types of invalidates (+/-1%).
> 
> With two NPUs active (on a 6-GPU system) the effect is more noticeable:
> 
>          mprotect rate (GB/s)
> Stride   Before      After      Speedup
> 64K         5.9        6.5          10%
> 1M         31.2       33.4           7%
> 2M         36.3       38.7           7%
> 4M        322.6      356.7          11%
> 
> Signed-off-by: Mark Hairgrove <mhairgrove at nvidia.com>
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c |   99 ++++++++++++++---------------
>  1 files changed, 48 insertions(+), 51 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 8006c54..c8f438a 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
>  }
>  
>  /* MMIO ATSD register offsets */
> -#define XTS_ATSD_AVA  1
> -#define XTS_ATSD_STAT 2
> +#define XTS_ATSD_LAUNCH 0
> +#define XTS_ATSD_AVA    1
> +#define XTS_ATSD_STAT   2
>  
> -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
> -				unsigned long launch, unsigned long va)
> +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
> +					bool flush)
>  {
> -	struct npu *npu = mmio_atsd_reg->npu;
> -	int reg = mmio_atsd_reg->reg;
> +	unsigned long launch = 0;
>  
> -	__raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
> -	eieio();
> -	__raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
> +	if (psize == MMU_PAGE_COUNT) {
> +		/* IS set to invalidate entire matching PID */
> +		launch |= PPC_BIT(12);
> +	} else {
> +		/* AP set to invalidate region of psize */
> +		launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
> +	}
> +
> +	/* PRS set to process-scoped */
> +	launch |= PPC_BIT(13);
> +
> +	/* PID */
> +	launch |= pid << PPC_BITLSHIFT(38);
> +
> +	/* No flush */
> +	launch |= !flush << PPC_BITLSHIFT(39);
> +
> +	return launch;
>  }
>  
> -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> -				unsigned long pid, bool flush)
> +static void mmio_atsd_regs_write(struct mmio_atsd_reg
> +			mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
> +			unsigned long val)
>  {
> -	int i;
> -	unsigned long launch;
> +	struct npu *npu;
> +	int i, reg;
>  
>  	for (i = 0; i <= max_npu2_index; i++) {
> -		if (mmio_atsd_reg[i].reg < 0)
> +		reg = mmio_atsd_reg[i].reg;
> +		if (reg < 0)
>  			continue;
>  
> -		/* IS set to invalidate matching PID */
> -		launch = PPC_BIT(12);
> -
> -		/* PRS set to process-scoped */
> -		launch |= PPC_BIT(13);
> -
> -		/* AP */
> -		launch |= (u64)
> -			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> -
> -		/* PID */
> -		launch |= pid << PPC_BITLSHIFT(38);
> +		npu = mmio_atsd_reg[i].npu;
> +		__raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
> +	}
> +}
>  
> -		/* No flush */
> -		launch |= !flush << PPC_BITLSHIFT(39);
> +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> +				unsigned long pid, bool flush)
> +{
> +	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
>  
> -		/* Invalidating the entire process doesn't use a va */
> -		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
> -	}
> +	/* Invalidating the entire process doesn't use a va */
> +	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
>  }
>  
>  static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
>  			unsigned long va, unsigned long pid, bool flush)
>  {
> -	int i;
>  	unsigned long launch;
>  
> -	for (i = 0; i <= max_npu2_index; i++) {
> -		if (mmio_atsd_reg[i].reg < 0)
> -			continue;
> -
> -		/* IS set to invalidate target VA */
> -		launch = 0;
> +	launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
>  
> -		/* PRS set to process scoped */
> -		launch |= PPC_BIT(13);
> +	/* Write all VAs first */
> +	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
>  
> -		/* AP */
> -		launch |= (u64)
> -			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> -
> -		/* PID */
> -		launch |= pid << PPC_BITLSHIFT(38);
> -
> -		/* No flush */
> -		launch |= !flush << PPC_BITLSHIFT(39);
> +	/* Issue one barrier for all address writes */
> +	eieio();
>  
> -		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
> -	}
> +	/* Launch */
> +	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
>  }
>  
>  #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
> 




More information about the Linuxppc-dev mailing list