[PATCH kernel 07/10] powerpc/powernv/npu: Rework TCE Kill handling
Alistair Popple
alistair at popple.id.au
Mon Mar 21 17:50:35 AEDT 2016
On Wed, 9 Mar 2016 17:29:03 Alexey Kardashevskiy wrote:
> The pnv_ioda_pe struct keeps an array of peers. At the moment it is only
> used to link GPU and NPU for 2 purposes:
>
> 1. Access NPU _quickly_ when configuring DMA for GPU - this was addressed
> in the previous patch by removing the use of it, as DMA setup is not
> something the kernel does constantly.
This was implemented using peers[] because we had peers[] anyway to deal with
TCE cache invalidation. I agree there's no reason to keep it around solely for
speed.
> 2. Invalidate TCE cache for NPU when it is invalidated for GPU.
> The GPU and NPU are in different PEs. There is already a mechanism to
> attach multiple iommu_table_group to the same iommu_table (used for VFIO),
> and this patch reuses it here.
Ok, this makes sense. I wasn't aware of iommu_table_groups but it looks like a
more elegant way of solving the problem. I'm not familiar with the way iommu
groups work but the changes make sense to me as far as I can tell.
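
For my own notes, my reading of the new mechanism is: pnv_pci_link_table_and_group()
adds the NPU PE's table_group to the GPU DMA window's it_group_list, so anything
that walks that list sees both PEs. Condensed into a rough sketch (names taken from
this patch, simplified and not compile-tested - the real code invalidates only the
affected range via pnv_pci_ioda2_do_tce_invalidate()):

    static void tce_invalidate_all_users(struct iommu_table *tbl, bool rm)
    {
            struct iommu_table_group_link *tgl;

            /* Every PE whose table_group was linked to this window gets
             * visited, so the GPU PE and the NPU PE are covered by one walk. */
            list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
                    struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                                    struct pnv_ioda_pe, table_group);

                    if (pe->phb->type == PNV_PHB_NPU)
                            /* No per-entry TCE kill on NPU, flush its whole cache */
                            pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
                    else
                            /* PHB3 can kill just the TCEs belonging to this PE */
                            pnv_pci_ioda2_tce_invalidate_pe(pe);
            }
    }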
> This gets rid of peers[] array and PNV_IODA_PE_PEER flag as they are
> not needed anymore.
>
> While we are here, add TCE cache invalidation after changing TVT.
Good idea, even though I guess we're unlikely to hit a problem in practice: I'm
pretty sure that on a normal system the links would get retrained between runs
with different TVTs, which implies the NPU gets reset too.
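
For reference, the ordering this adds in both the 32-bit and bypass paths boils
down to the following (condensed from the hunks below; the OPAL call name is from
memory since it sits outside the quoted hunk context):

    rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
                    npe->pe_number, npe->pe_number,
                    npe->tce_bypass_base, top);
    if (rc == OPAL_SUCCESS)
            /* TVT changed, so drop whatever the NPU has cached */
            pnv_pci_ioda2_tce_invalidate_entire(phb, false);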
> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
Reviewed-By: Alistair Popple <alistair at popple.id.au>
> ---
> arch/powerpc/platforms/powernv/npu-dma.c | 75 +++++++++----------------------
> arch/powerpc/platforms/powernv/pci-ioda.c | 57 +++--------------------
> arch/powerpc/platforms/powernv/pci.h | 6 ---
> 3 files changed, 29 insertions(+), 109 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 8960e46..866d3d3 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -136,22 +136,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
> struct pnv_ioda_pe *pe;
> struct pci_dn *pdn;
>
> - if (npe->flags & PNV_IODA_PE_PEER) {
> - pe = npe->peers[0];
> - pdev = pe->pdev;
> - } else {
> - pdev = pnv_pci_get_gpu_dev(npe->pdev);
> - if (!pdev)
> - return NULL;
> + pdev = pnv_pci_get_gpu_dev(npe->pdev);
> + if (!pdev)
> + return NULL;
>
> - pdn = pci_get_pdn(pdev);
> - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
> - return NULL;
> + pdn = pci_get_pdn(pdev);
> + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
> + return NULL;
>
> - hose = pci_bus_to_host(pdev->bus);
> - phb = hose->private_data;
> - pe = &phb->ioda.pe_array[pdn->pe_number];
> - }
> + hose = pci_bus_to_host(pdev->bus);
> + phb = hose->private_data;
> + pe = &phb->ioda.pe_array[pdn->pe_number];
>
> if (gpdev)
> *gpdev = pdev;
> @@ -159,42 +154,6 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
> return pe;
> }
>
> -void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe)
> -{
> - struct pnv_ioda_pe *gpe;
> - struct pci_dev *gpdev;
> - int i, avail = -1;
> -
> - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
> - return;
> -
> - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
> - if (!gpe)
> - return;
> -
> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
> - /* Nothing to do if the PE is already connected. */
> - if (gpe->peers[i] == npe)
> - return;
> -
> - if (!gpe->peers[i])
> - avail = i;
> - }
> -
> - if (WARN_ON(avail < 0))
> - return;
> -
> - gpe->peers[avail] = npe;
> - gpe->flags |= PNV_IODA_PE_PEER;
> -
> - /*
> - * We assume that the NPU devices only have a single peer PE
> - * (the GPU PCIe device PE).
> - */
> - npe->peers[0] = gpe;
> - npe->flags |= PNV_IODA_PE_PEER;
> -}
> -
> /*
> * Enables 32 bit DMA on NPU.
> */
> @@ -225,6 +184,13 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
> if (rc != OPAL_SUCCESS)
> pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n",
> __func__, rc, phb->hose->global_number, npe->pe_number);
> + else
> + pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> +
> + /* Add the table to the list so its TCE cache will get invalidated */
> + npe->table_group.tables[0] = tbl;
> + pnv_pci_link_table_and_group(phb->hose->node, 0,
> + tbl, &npe->table_group);
>
> /*
> * We don't initialise npu_pe->tce32_table as we always use
> @@ -245,10 +211,10 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
> int64_t rc = 0;
> phys_addr_t top = memblock_end_of_DRAM();
>
> - if (phb->type != PNV_PHB_NPU || !npe->pdev)
> - return -EINVAL;
> -
> /* Enable the bypass window */
> + pnv_pci_unlink_table_and_group(npe->table_group.tables[0],
> + &npe->table_group);
> + npe->table_group.tables[0] = NULL;
>
> npe->tce_bypass_base = 0;
> top = roundup_pow_of_two(top);
> @@ -258,6 +224,9 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
> npe->pe_number, npe->pe_number,
> npe->tce_bypass_base, top);
>
> + if (rc == OPAL_SUCCESS)
> + pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> +
> return rc;
> }
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 47085b7..5a6cf2e 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1836,23 +1836,12 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
> /* 01xb - invalidate TCEs that match the specified PE# */
> unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
> struct pnv_phb *phb = pe->phb;
> - struct pnv_ioda_pe *npe;
> - int i;
>
> if (!phb->ioda.tce_inval_reg)
> return;
>
> mb(); /* Ensure above stores are visible */
> __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
> -
> - if (pe->flags & PNV_IODA_PE_PEER)
> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
> - npe = pe->peers[i];
> - if (!npe || npe->phb->type != PNV_PHB_NPU)
> - continue;
> -
> - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
> - }
> }
>
> static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
> @@ -1887,33 +1876,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> struct iommu_table_group_link *tgl;
>
> list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
> - struct pnv_ioda_pe *npe;
> struct pnv_ioda_pe *pe = container_of(tgl->table_group,
> struct pnv_ioda_pe, table_group);
> __be64 __iomem *invalidate = rm ?
> (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
> pe->phb->ioda.tce_inval_reg;
> - int i;
>
> - pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
> - invalidate, tbl->it_page_shift,
> - index, npages);
> -
> - if (pe->flags & PNV_IODA_PE_PEER)
> + if (pe->phb->type == PNV_PHB_NPU) {
> /*
> * The NVLink hardware does not support TCE kill
> * per TCE entry so we have to invalidate
> * the entire cache for it.
> */
> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
> - npe = pe->peers[i];
> - if (!npe || npe->phb->type != PNV_PHB_NPU ||
> - !npe->phb->ioda.tce_inval_reg)
> - continue;
> -
> - pnv_pci_ioda2_tce_invalidate_entire(npe->phb,
> - rm);
> - }
> + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
> + continue;
> + }
> + pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
> + invalidate, tbl->it_page_shift,
> + index, npages);
> }
> }
>
> @@ -3094,26 +3074,6 @@ static void pnv_pci_ioda_create_dbgfs(void)
> #endif /* CONFIG_DEBUG_FS */
> }
>
> -static void pnv_npu_ioda_fixup(void)
> -{
> - bool enable_bypass;
> - struct pci_controller *hose, *tmp;
> - struct pnv_phb *phb;
> - struct pnv_ioda_pe *pe;
> -
> - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> - phb = hose->private_data;
> - if (phb->type != PNV_PHB_NPU)
> - continue;
> -
> - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
> - enable_bypass = dma_get_mask(&pe->pdev->dev) ==
> - DMA_BIT_MASK(64);
> - pnv_npu_init_dma_pe(pe);
> - }
> - }
> -}
> -
> static void pnv_pci_ioda_fixup(void)
> {
> pnv_pci_ioda_setup_PEs();
> @@ -3126,9 +3086,6 @@ static void pnv_pci_ioda_fixup(void)
> eeh_init();
> eeh_addr_cache_build();
> #endif
> -
> - /* Link NPU IODA tables to their PCI devices. */
> - pnv_npu_ioda_fixup();
> }
>
> /*
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index d574a9d..06405fd 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -24,7 +24,6 @@ enum pnv_phb_model {
> #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */
> #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
> #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
> -#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */
>
> /* Data associated with a PE, including IOMMU tracking etc.. */
> struct pnv_phb;
> @@ -32,9 +31,6 @@ struct pnv_ioda_pe {
> unsigned long flags;
> struct pnv_phb *phb;
>
> -#define PNV_IODA_MAX_PEER_PES 8
> - struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES];
> -
> /* A PE can be associated with a single device or an
> * entire bus (& children). In the former case, pdev
> * is populated, in the later case, pbus is.
> @@ -237,8 +233,6 @@ extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
> extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>
> /* Nvlink functions */
> -extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
> -extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
> extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
> extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
>
>