[PATCH v3] powerpc/npu-dma.c: Fix deadlock in mmio_invalidate

Alistair Popple alistair at popple.id.au
Thu Mar 8 14:41:53 AEDT 2018


Michael,

This won't apply cleanly on top of Balbir's MMIO ATSD Flushing patch
(https://patchwork.ozlabs.org/patch/848343/). I will resend a v4 which applies
cleanly on top of that as the rebase/merge is non-trivial.

- Alistair

On Friday, 2 March 2018 4:18:45 PM AEDT Alistair Popple wrote:
> When sending TLB invalidates to the NPU we need to send extra flushes due
> to a hardware issue. The original implementation would lock all the
> ATSD MMIO registers sequentially before unlocking and relocking each of
> them sequentially to do the extra flush.
> 
> This introduced a deadlock as it is possible for one thread to hold one
> ATSD register whilst waiting for another register to be freed, while the
> other thread holds that second register and is waiting for the register
> held by the first thread to be freed.
> 
> For example if there are two threads and two ATSD registers:
> 
> Thread A	Thread B
> Acquire 1
> Acquire 2
> Release 1	Acquire 1
> Wait 1		Wait 2
> 
> Both threads will be stuck waiting to acquire a register resulting in an
> RCU stall warning or soft lockup.
> 
> This patch solves the deadlock by refactoring the code to ensure registers
> are not released between flushes and to ensure all registers are either
> acquired or released together and in order.
> 
> Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
> Signed-off-by: Alistair Popple <alistair at popple.id.au>
> 
> ---
> 
> v2:
>  - Added memory barriers around ATSD register acquisition/release
>  - Added compiler barriers around npdev[][] assignment
> 
> v3:
>  - Added comments to describe required locking
> 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 228 +++++++++++++++++++------------
>  1 file changed, 139 insertions(+), 89 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 0a253b64ac5f..0dec96eb3358 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -410,6 +410,11 @@ struct npu_context {
>  	void *priv;
>  };
>  
> +struct mmio_atsd_reg {
> +	struct npu *npu;
> +	int reg;
> +};
> +
>  /*
>   * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
>   * if none are available.
> @@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
>  	int i;
>  
>  	for (i = 0; i < npu->mmio_atsd_count; i++) {
> -		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
> +		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
>  			return i;
>  	}
>  
> @@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
>  
>  static void put_mmio_atsd_reg(struct npu *npu, int reg)
>  {
> -	clear_bit(reg, &npu->mmio_atsd_usage);
> +	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
>  }
>  
>  /* MMIO ATSD register offsets */
>  #define XTS_ATSD_AVA  1
>  #define XTS_ATSD_STAT 2
>  
> -static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
> -				unsigned long va)
> +static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
> +				unsigned long launch, unsigned long va)
>  {
> -	int mmio_atsd_reg;
> -
> -	do {
> -		mmio_atsd_reg = get_mmio_atsd_reg(npu);
> -		cpu_relax();
> -	} while (mmio_atsd_reg < 0);
> +	struct npu *npu = mmio_atsd_reg->npu;
> +	int reg = mmio_atsd_reg->reg;
>  
>  	__raw_writeq(cpu_to_be64(va),
> -		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
> +		npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
>  	eieio();
> -	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
> -
> -	return mmio_atsd_reg;
> +	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
>  }
>  
> -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
> +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> +				unsigned long pid, bool flush)
>  {
> +	int i;
>  	unsigned long launch;
>  
> -	/* IS set to invalidate matching PID */
> -	launch = PPC_BIT(12);
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		if (mmio_atsd_reg[i].reg < 0)
> +			continue;
> +
> +		/* IS set to invalidate matching PID */
> +		launch = PPC_BIT(12);
>  
> -	/* PRS set to process-scoped */
> -	launch |= PPC_BIT(13);
> +		/* PRS set to process-scoped */
> +		launch |= PPC_BIT(13);
>  
> -	/* AP */
> -	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> +		/* AP */
> +		launch |= (u64)
> +			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
>  
> -	/* PID */
> -	launch |= pid << PPC_BITLSHIFT(38);
> +		/* PID */
> +		launch |= pid << PPC_BITLSHIFT(38);
>  
> -	/* No flush */
> -	launch |= !flush << PPC_BITLSHIFT(39);
> +		/* No flush */
> +		launch |= !flush << PPC_BITLSHIFT(39);
>  
> -	/* Invalidating the entire process doesn't use a va */
> -	return mmio_launch_invalidate(npu, launch, 0);
> +		/* Invalidating the entire process doesn't use a va */
> +		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
> +	}
>  }
>  
> -static int mmio_invalidate_va(struct npu *npu, unsigned long va,
> -			unsigned long pid, bool flush)
> +static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> +			unsigned long va, unsigned long pid, bool flush)
>  {
> +	int i;
>  	unsigned long launch;
>  
> -	/* IS set to invalidate target VA */
> -	launch = 0;
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		if (mmio_atsd_reg[i].reg < 0)
> +			continue;
>  
> -	/* PRS set to process scoped */
> -	launch |= PPC_BIT(13);
> +		/* IS set to invalidate target VA */
> +		launch = 0;
>  
> -	/* AP */
> -	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> +		/* PRS set to process scoped */
> +		launch |= PPC_BIT(13);
>  
> -	/* PID */
> -	launch |= pid << PPC_BITLSHIFT(38);
> +		/* AP */
> +		launch |= (u64)
> +			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
>  
> -	/* No flush */
> -	launch |= !flush << PPC_BITLSHIFT(39);
> +		/* PID */
> +		launch |= pid << PPC_BITLSHIFT(38);
>  
> -	return mmio_launch_invalidate(npu, launch, va);
> +		/* No flush */
> +		launch |= !flush << PPC_BITLSHIFT(39);
> +
> +		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
> +	}
>  }
>  
>  #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
>  
> -struct mmio_atsd_reg {
> -	struct npu *npu;
> -	int reg;
> -};
> -
>  static void mmio_invalidate_wait(
> -	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
> +	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
>  {
>  	struct npu *npu;
>  	int i, reg;
> @@ -522,16 +531,65 @@ static void mmio_invalidate_wait(
>  		reg = mmio_atsd_reg[i].reg;
>  		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
>  			cpu_relax();
> +	}
> +}
>  
> -		put_mmio_atsd_reg(npu, reg);
> +/*
> + * Acquires all the address translation shootdown (ATSD) registers required to
> + * launch an ATSD on all links this npu_context is active on.
> + */
> +static void acquire_atsd_reg(struct npu_context *npu_context,
> +			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
> +{
> +	int i, j;
> +	struct npu *npu;
> +	struct pci_dev *npdev;
> +	struct pnv_phb *nphb;
>  
> -		/*
> -		 * The GPU requires two flush ATSDs to ensure all entries have
> -		 * been flushed. We use PID 0 as it will never be used for a
> -		 * process on the GPU.
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		mmio_atsd_reg[i].reg = -1;
> +		for (j = 0; j < NV_MAX_LINKS; j++) {
> +			/* There are no ordering requirements with respect to
> +			 * the setup of struct npu_context, but to ensure
> +			 * consistent behaviour we need to ensure npdev[][] is
> +			 * only read once.
> +			 */
> +			npdev = READ_ONCE(npu_context->npdev[i][j]);
> +			if (!npdev)
> +				continue;
> +
> +			nphb = pci_bus_to_host(npdev->bus)->private_data;
> +			npu = &nphb->npu;
> +			mmio_atsd_reg[i].npu = npu;
> +			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
> +			while (mmio_atsd_reg[i].reg < 0) {
> +				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
> +				cpu_relax();
> +			}
> +			break;
> +		}
> +	}
> +}
> +
> +/*
> + * Release previously acquired ATSD registers. To avoid deadlocks the registers
> + * must be released in the same order they were acquired above in
> + * acquire_atsd_reg.
> + */
> +static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
> +{
> +	int i;
> +
> +	for (i = 0; i <= max_npu2_index; i++) {
> +		/* We can't rely on npu_context->npdev[][] being the same here
> +		 * as when acquire_atsd_reg() was called, hence we use the
> +		 * values stored in mmio_atsd_reg during the acquire phase
> +		 * rather than re-reading npdev[][]
>  		 */
> -		if (flush)
> -			mmio_invalidate_pid(npu, 0, true);
> +		if (mmio_atsd_reg[i].reg < 0)
> +			continue;
> +
> +		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
>  	}
>  }
>  
> @@ -542,10 +600,6 @@ static void mmio_invalidate_wait(
>  static void mmio_invalidate(struct npu_context *npu_context, int va,
>  			unsigned long address, bool flush)
>  {
> -	int i, j;
> -	struct npu *npu;
> -	struct pnv_phb *nphb;
> -	struct pci_dev *npdev;
>  	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
>  	unsigned long pid = npu_context->mm->context.id;
>  
> @@ -561,37 +615,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
>  	 * Loop over all the NPUs this process is active on and launch
>  	 * an invalidate.
>  	 */
> -	for (i = 0; i <= max_npu2_index; i++) {
> -		mmio_atsd_reg[i].reg = -1;
> -		for (j = 0; j < NV_MAX_LINKS; j++) {
> -			npdev = npu_context->npdev[i][j];
> -			if (!npdev)
> -				continue;
> -
> -			nphb = pci_bus_to_host(npdev->bus)->private_data;
> -			npu = &nphb->npu;
> -			mmio_atsd_reg[i].npu = npu;
> -
> -			if (va)
> -				mmio_atsd_reg[i].reg =
> -					mmio_invalidate_va(npu, address, pid,
> -							flush);
> -			else
> -				mmio_atsd_reg[i].reg =
> -					mmio_invalidate_pid(npu, pid, flush);
> -
> -			/*
> -			 * The NPU hardware forwards the shootdown to all GPUs
> -			 * so we only have to launch one shootdown per NPU.
> -			 */
> -			break;
> -		}
> +	acquire_atsd_reg(npu_context, mmio_atsd_reg);
> +	if (va)
> +		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
> +	else
> +		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
> +
> +	mmio_invalidate_wait(mmio_atsd_reg);
> +	if (flush) {
> +		/*
> +		 * The GPU requires two flush ATSDs to ensure all entries have
> +		 * been flushed. We use PID 0 as it will never be used for a
> +		 * process on the GPU.
> +		 */
> +		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> +		mmio_invalidate_wait(mmio_atsd_reg);
> +		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> +		mmio_invalidate_wait(mmio_atsd_reg);
>  	}
> -
> -	mmio_invalidate_wait(mmio_atsd_reg, flush);
> -	if (flush)
> -		/* Wait for the flush to complete */
> -		mmio_invalidate_wait(mmio_atsd_reg, false);
> +	release_atsd_reg(mmio_atsd_reg);
>  }
>  
>  static void pnv_npu2_mn_release(struct mmu_notifier *mn,
> @@ -726,7 +768,15 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
>  							&nvlink_index)))
>  		return ERR_PTR(-ENODEV);
> -	npu_context->npdev[npu->index][nvlink_index] = npdev;
> +
> +	/* npdev is a pci_dev pointer setup by the PCI code. We assign it to
> +	 * npdev[][] to indicate to the mmu notifiers that an invalidation
> +	 * should also be sent over this nvlink. The notifiers don't use any
> +	 * other fields in npu_context, so we just need to ensure that when they
> +	 * dereference npu_context->npdev[][] it is either a valid pointer or
> +	 * NULL.
> +	 */
> +	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
>  
>  	if (!nphb->npu.nmmu_flush) {
>  		/*
> @@ -778,7 +828,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
>  	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
>  							&nvlink_index)))
>  		return;
> -	npu_context->npdev[npu->index][nvlink_index] = NULL;
> +	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
>  	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
>  				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
>  	kref_put(&npu_context->kref, pnv_npu2_release_context);
> 




More information about the Linuxppc-dev mailing list