[PATCH 1/2] powernv/npu: Add lock to prevent race in concurrent context init/destroy

Fri Apr 13 12:02:05 AEST 2018

On Wed, 11 Apr 2018, Alistair Popple wrote:

> The pnv_npu2_init_context() and pnv_npu2_destroy_context() functions are
> used to allocate/free contexts to allow address translation and shootdown
> by the NPU on a particular GPU. Context initialisation is implicitly safe
> as it is protected by the requirement mmap_sem be held in write mode,
> however pnv_npu2_destroy_context() does not require mmap_sem to be held and
> it is not safe to call with a concurrent initialisation for a different
> GPU.
> 
> It was assumed the driver would ensure destruction was not called
> concurrently with initialisation. However the driver may be simplified by
> allowing concurrent initialisation and destruction for different GPUs. As
> npu context creation/destruction is not a performance critical path and the
> critical section is not large a single spinlock is used for simplicity.
> 
> Signed-off-by: Alistair Popple <alistair at popple.id.au>
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 51 ++++++++++++++++++++++++++------
>  1 file changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 1cbef1f9cd37..cb77162f4e7a 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -34,6 +34,12 @@
>  #define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
>  
>  /*
> + * spinlock to protect initialisation of an npu_context for a particular
> + * mm_struct.
> + */
> +DEFINE_SPINLOCK(npu_context_lock);

static DEFINE_SPINLOCK

> +
> +/*
>   * Other types of TCE cache invalidation are not functional in the
>   * hardware.
>   */
> @@ -694,7 +700,8 @@ static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
>   * Returns an error if there no contexts are currently available or a
>   * npu_context which should be passed to pnv_npu2_handle_fault().
>   *
> - * mmap_sem must be held in write mode.
> + * mmap_sem must be held in write mode and must not be called from interrupt
> + * context.
>   */
>  struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  			unsigned long flags,
> @@ -741,7 +748,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  	/*
>  	 * Setup the NPU context table for a particular GPU. These need to be
>  	 * per-GPU as we need the tables to filter ATSDs when there are no
> -	 * active contexts on a particular GPU.
> +	 * active contexts on a particular GPU. It is safe for these to be
> +	 * called concurrently with destroy as the OPAL call takes appropriate
> +	 * locks and refcounts on init/destroy.
>  	 */
>  	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
>  				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
> @@ -752,8 +761,19 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  	 * We store the npu pci device so we can more easily get at the
>  	 * associated npus.
>  	 */
> +	spin_lock(&npu_context_lock);
>  	npu_context = mm->context.npu_context;
> +	if (npu_context)
> +		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
> +	spin_unlock(&npu_context_lock);
> +
>  	if (!npu_context) {
> +		/*
> +		 * We can set up these fields without holding the
> +		 * npu_context_lock as the npu_context hasn't been returned to
> +		 * the caller meaning it can't be destroyed. Parallel allocation
> +		 * is protected against by mmap_sem.
> +		 */
>  		rc = -ENOMEM;
>  		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
>  		if (npu_context) {
> @@ -772,8 +792,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
>  		}
>  
>  		mm->context.npu_context = npu_context;
> -	} else {
> -		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
>  	}
>  
>  	npu_context->release_cb = cb;
> @@ -811,15 +829,16 @@ static void pnv_npu2_release_context(struct kref *kref)
>  		mm_context_remove_copro(npu_context->mm);

mm_context_remove_copro will now be called while holding a spin lock. Just
as a sanity check, is that ok? I haven't hit any problems in testing
and I see radix__flush_all_mm call preempt_disable/enable so I assume so,
but it doesn't hurt to double-check my understanding.

>  
>  	npu_context->mm->context.npu_context = NULL;
> -	mmu_notifier_unregister(&npu_context->mn,
> -				npu_context->mm);
> -
> -	kfree(npu_context);
>  }
>  
> +/*
> + * Destroy a context on the given GPU. May free the npu_context if it is no
> + * longer active on any GPUs. Must not be called from interrupt context.
> + */
>  void pnv_npu2_destroy_context(struct npu_context *npu_context,
>  			struct pci_dev *gpdev)
>  {
> +	int removed;
>  	struct pnv_phb *nphb;
>  	struct npu *npu;
>  	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
> @@ -841,7 +860,21 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
>  	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
>  	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
>  				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
> -	kref_put(&npu_context->kref, pnv_npu2_release_context);
> +	spin_lock(&npu_context_lock);
> +	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
> +	spin_unlock(&npu_context_lock);
> +
> +	/*
> +	 * We need to do this outside of pnv_npu2_release_context so that it is
> +	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
> +	 */
> +	if (removed) {
> +		mmu_notifier_unregister(&npu_context->mn,
> +					npu_context->mm);
> +
> +		kfree(npu_context);
> +	}
> +
>  }
>  EXPORT_SYMBOL(pnv_npu2_destroy_context);
>  
> -- 
> 2.11.0
> 
> 

Reviewed-by: Mark Hairgrove <mhairgrove at nvidia.com>
Tested-by: Mark Hairgrove <mhairgrove at nvidia.com>