[PATCH 1/2] powernv/npu: Add lock to prevent race in concurrent context init/destroy
Mark Hairgrove
mhairgrove at nvidia.com
Fri Apr 13 12:02:05 AEST 2018
On Wed, 11 Apr 2018, Alistair Popple wrote:
> The pnv_npu2_init_context() and pnv_npu2_destroy_context() functions are
> used to allocate/free contexts to allow address translation and shootdown
> by the NPU on a particular GPU. Context initialisation is implicitly safe
> as it is protected by the requirement mmap_sem be held in write mode,
> however pnv_npu2_destroy_context() does not require mmap_sem to be held and
> it is not safe to call with a concurrent initialisation for a different
> GPU.
>
> It was assumed the driver would ensure destruction was not called
> concurrently with initialisation. However the driver may be simplified by
> allowing concurrent initialisation and destruction for different GPUs. As
> npu context creation/destruction is not a performance critical path and the
> critical section is not large a single spinlock is used for simplicity.
>
> Signed-off-by: Alistair Popple <alistair at popple.id.au>
> ---
> arch/powerpc/platforms/powernv/npu-dma.c | 51 ++++++++++++++++++++++++++------
> 1 file changed, 42 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 1cbef1f9cd37..cb77162f4e7a 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -34,6 +34,12 @@
> #define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
>
> /*
> + * spinlock to protect initialisation of an npu_context for a particular
> + * mm_struct.
> + */
> +DEFINE_SPINLOCK(npu_context_lock);
This lock is only used within this file, so it should be declared static: static DEFINE_SPINLOCK(npu_context_lock);
> +
> +/*
> * Other types of TCE cache invalidation are not functional in the
> * hardware.
> */
> @@ -694,7 +700,8 @@ static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
> * Returns an error if there no contexts are currently available or a
> * npu_context which should be passed to pnv_npu2_handle_fault().
> *
> - * mmap_sem must be held in write mode.
> + * mmap_sem must be held in write mode and must not be called from interrupt
> + * context.
> */
> struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
> unsigned long flags,
> @@ -741,7 +748,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
> /*
> * Setup the NPU context table for a particular GPU. These need to be
> * per-GPU as we need the tables to filter ATSDs when there are no
> - * active contexts on a particular GPU.
> + * active contexts on a particular GPU. It is safe for these to be
> + * called concurrently with destroy as the OPAL call takes appropriate
> + * locks and refcounts on init/destroy.
> */
> rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
> PCI_DEVID(gpdev->bus->number, gpdev->devfn));
> @@ -752,8 +761,19 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
> * We store the npu pci device so we can more easily get at the
> * associated npus.
> */
> + spin_lock(&npu_context_lock);
> npu_context = mm->context.npu_context;
> + if (npu_context)
> + WARN_ON(!kref_get_unless_zero(&npu_context->kref));
> + spin_unlock(&npu_context_lock);
> +
> if (!npu_context) {
> + /*
> + * We can set up these fields without holding the
> + * npu_context_lock as the npu_context hasn't been returned to
> + * the caller meaning it can't be destroyed. Parallel allocation
> + * is protected against by mmap_sem.
> + */
> rc = -ENOMEM;
> npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
> if (npu_context) {
> @@ -772,8 +792,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
> }
>
> mm->context.npu_context = npu_context;
> - } else {
> - WARN_ON(!kref_get_unless_zero(&npu_context->kref));
> }
>
> npu_context->release_cb = cb;
> @@ -811,15 +829,16 @@ static void pnv_npu2_release_context(struct kref *kref)
> mm_context_remove_copro(npu_context->mm);
mm_context_remove_copro will now be called while holding a spin lock. Just
as a sanity check, is that ok? I haven't hit any problems in testing
and I see radix__flush_all_mm call preempt_disable/enable so I assume so,
but it doesn't hurt to double-check my understanding.
>
> npu_context->mm->context.npu_context = NULL;
> - mmu_notifier_unregister(&npu_context->mn,
> - npu_context->mm);
> -
> - kfree(npu_context);
> }
>
> +/*
> + * Destroy a context on the given GPU. May free the npu_context if it is no
> + * longer active on any GPUs. Must not be called from interrupt context.
> + */
> void pnv_npu2_destroy_context(struct npu_context *npu_context,
> struct pci_dev *gpdev)
> {
> + int removed;
> struct pnv_phb *nphb;
> struct npu *npu;
> struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
> @@ -841,7 +860,21 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
> WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
> opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
> PCI_DEVID(gpdev->bus->number, gpdev->devfn));
> - kref_put(&npu_context->kref, pnv_npu2_release_context);
> + spin_lock(&npu_context_lock);
> + removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
> + spin_unlock(&npu_context_lock);
> +
> + /*
> + * We need to do this outside of pnv_npu2_release_context so that it is
> + * outside the spinlock as mmu_notifier_unregister uses SRCU.
> + */
> + if (removed) {
> + mmu_notifier_unregister(&npu_context->mn,
> + npu_context->mm);
> +
> + kfree(npu_context);
> + }
> +
> }
> EXPORT_SYMBOL(pnv_npu2_destroy_context);
>
> --
> 2.11.0
>
>
Reviewed-by: Mark Hairgrove <mhairgrove at nvidia.com>
Tested-by: Mark Hairgrove <mhairgrove at nvidia.com>
More information about the Linuxppc-dev
mailing list