[PATCH kernel v4 10/11] powerpc/powernv/npu: Rework TCE Kill handling
Alexey Kardashevskiy
aik at ozlabs.ru
Thu May 5 14:23:20 AEST 2016
On 05/03/2016 05:37 PM, Alistair Popple wrote:
> On Fri, 29 Apr 2016 18:55:23 Alexey Kardashevskiy wrote:
>> The pnv_ioda_pe struct keeps an array of peers. At the moment it is only
>> used to link GPU and NPU for 2 purposes:
>>
>> 1. Access NPU quickly when configuring DMA for GPU - this was addressed
>> in the previous patch by removing use of it as DMA setup is not what
>> the kernel would constantly do.
>
> Agreed. It was used here because the peer array was added to deal with (2)
> below ...
>
>> 2. Invalidate TCE cache for NPU when it is invalidated for GPU.
>> GPU and NPU are in different PE. There is already a mechanism to
>> attach multiple iommu_table_group to the same iommu_table (used for VFIO),
>> we can reuse it here, which is what this patch does.
>
> ... because we weren't aware iommu_table_group could be used to do this
> instead.
>
>> This gets rid of peers[] array and PNV_IODA_PE_PEER flag as they are
>> not needed anymore.
>
> Happy to see it go. I'm not too familiar with iommu groups but based on the
> code and what you have described to me both here and offline everything looks
> good to me. One pretty minor style comment below.
>
> Reviewed-By: Alistair Popple <alistair at popple.id.au>
>
>> While we are here, add TCE cache invalidation after enabling bypass.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> ---
>> Changes:
>> v4:
>> * reworked as "powerpc/powernv/npu: Add set/unset window helpers" has been
>> added
>> ---
>> arch/powerpc/platforms/powernv/npu-dma.c | 69
> +++++++++----------------------
>> arch/powerpc/platforms/powernv/pci-ioda.c | 57 ++++---------------------
>> arch/powerpc/platforms/powernv/pci.h | 6 ---
>> 3 files changed, 26 insertions(+), 106 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c
> b/arch/powerpc/platforms/powernv/npu-dma.c
>> index 800d70f..cb2d1da 100644
>> --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> @@ -136,22 +136,17 @@ static struct pnv_ioda_pe
> *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
>> struct pnv_ioda_pe *pe;
>> struct pci_dn *pdn;
>>
>> - if (npe->flags & PNV_IODA_PE_PEER) {
>> - pe = npe->peers[0];
>> - pdev = pe->pdev;
>> - } else {
>> - pdev = pnv_pci_get_gpu_dev(npe->pdev);
>> - if (!pdev)
>> - return NULL;
>> + pdev = pnv_pci_get_gpu_dev(npe->pdev);
>> + if (!pdev)
>> + return NULL;
>>
>> - pdn = pci_get_pdn(pdev);
>> - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
>> - return NULL;
>> + pdn = pci_get_pdn(pdev);
>> + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
>> + return NULL;
>>
>> - hose = pci_bus_to_host(pdev->bus);
>> - phb = hose->private_data;
>> - pe = &phb->ioda.pe_array[pdn->pe_number];
>> - }
>> + hose = pci_bus_to_host(pdev->bus);
>> + phb = hose->private_data;
>> + pe = &phb->ioda.pe_array[pdn->pe_number];
>>
>> if (gpdev)
>> *gpdev = pdev;
>> @@ -186,6 +181,10 @@ static long pnv_npu_set_window(struct pnv_ioda_pe *npe,
>> }
>> pnv_pci_ioda2_tce_invalidate_entire(phb, false);
>>
>> + /* Add the table to the list so its TCE cache will get invalidated */
>> + pnv_pci_link_table_and_group(phb->hose->node, 0,
>> + tbl, &npe->table_group);
>
> Where tbl is associated with the GPU and is what links the NPU and GPU PEs.
>
>> return 0;
>> }
>>
>> @@ -206,45 +205,12 @@ static long pnv_npu_unset_window(struct pnv_ioda_pe
> *npe)
>> }
>> pnv_pci_ioda2_tce_invalidate_entire(phb, false);
>>
>> + pnv_pci_unlink_table_and_group(npe->table_group.tables[0],
>> + &npe->table_group);
>> +
>> return 0;
>> }
>>
>> -void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe)
>> -{
>> - struct pnv_ioda_pe *gpe;
>> - struct pci_dev *gpdev;
>> - int i, avail = -1;
>> -
>> - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
>> - return;
>> -
>> - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
>> - if (!gpe)
>> - return;
>> -
>> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
>> - /* Nothing to do if the PE is already connected. */
>> - if (gpe->peers[i] == npe)
>> - return;
>> -
>> - if (!gpe->peers[i])
>> - avail = i;
>> - }
>> -
>> - if (WARN_ON(avail < 0))
>> - return;
>> -
>> - gpe->peers[avail] = npe;
>> - gpe->flags |= PNV_IODA_PE_PEER;
>> -
>> - /*
>> - * We assume that the NPU devices only have a single peer PE
>> - * (the GPU PCIe device PE).
>> - */
>> - npe->peers[0] = gpe;
>> - npe->flags |= PNV_IODA_PE_PEER;
>> -}
>> -
>> /*
>> * Enables 32 bit DMA on NPU.
>> */
>> @@ -302,6 +268,9 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe
> *npe)
>> npe->pe_number, npe->pe_number,
>> 0 /* bypass base */, top);
>>
>> + if (rc == OPAL_SUCCESS)
>> + pnv_pci_ioda2_tce_invalidate_entire(phb, false);
>> +
>> return rc;
>> }
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index db7695f..922c74c 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1816,23 +1816,12 @@ static inline void
> pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
>> /* 01xb - invalidate TCEs that match the specified PE# */
>> unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
>> struct pnv_phb *phb = pe->phb;
>> - struct pnv_ioda_pe *npe;
>> - int i;
>>
>> if (!phb->ioda.tce_inval_reg)
>> return;
>>
>> mb(); /* Ensure above stores are visible */
>> __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
>> -
>> - if (pe->flags & PNV_IODA_PE_PEER)
>> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
>> - npe = pe->peers[i];
>> - if (!npe || npe->phb->type != PNV_PHB_NPU)
>> - continue;
>> -
>> - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
>> - }
>> }
>>
>> static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
>> @@ -1867,33 +1856,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct
> iommu_table *tbl,
>> struct iommu_table_group_link *tgl;
>>
>> list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
>> - struct pnv_ioda_pe *npe;
>> struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>> struct pnv_ioda_pe, table_group);
>> __be64 __iomem *invalidate = rm ?
>> (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
>> pe->phb->ioda.tce_inval_reg;
>> - int i;
>>
>> - pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
>> - invalidate, tbl->it_page_shift,
>> - index, npages);
>> -
>> - if (pe->flags & PNV_IODA_PE_PEER)
>> + if (pe->phb->type == PNV_PHB_NPU) {
>> /*
>> * The NVLink hardware does not support TCE kill
>> * per TCE entry so we have to invalidate
>> * the entire cache for it.
>> */
>> - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
>> - npe = pe->peers[i];
>> - if (!npe || npe->phb->type != PNV_PHB_NPU ||
>> - !npe->phb->ioda.tce_inval_reg)
>> - continue;
>> -
>> - pnv_pci_ioda2_tce_invalidate_entire(npe->phb,
>> - rm);
>> - }
>> + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
>> + continue;
>> + }
>
> Personally I think an if/else instead of the continue would be cleaner here.
I see it as a hack, and handling it with "continue" is nicer because it
does not touch the correct code (as enclosing that code in an "else {}"
block would).
Also, that would increase the indentation of the
pnv_pci_ioda2_do_tce_invalidate() below for no good reason.
>
>> + pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
>> + invalidate, tbl->it_page_shift,
>> + index, npages);
>> }
>> }
>>
>> @@ -3061,26 +3041,6 @@ static void pnv_pci_ioda_create_dbgfs(void)
>> #endif /* CONFIG_DEBUG_FS */
>> }
>>
>> -static void pnv_npu_ioda_fixup(void)
>> -{
>> - bool enable_bypass;
>> - struct pci_controller *hose, *tmp;
>> - struct pnv_phb *phb;
>> - struct pnv_ioda_pe *pe;
>> -
>> - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
>> - phb = hose->private_data;
>> - if (phb->type != PNV_PHB_NPU)
>> - continue;
>> -
>> - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
>> - enable_bypass = dma_get_mask(&pe->pdev->dev) ==
>> - DMA_BIT_MASK(64);
>> - pnv_npu_init_dma_pe(pe);
>> - }
>> - }
>> -}
>> -
>> static void pnv_pci_ioda_fixup(void)
>> {
>> pnv_pci_ioda_setup_PEs();
>> @@ -3093,9 +3053,6 @@ static void pnv_pci_ioda_fixup(void)
>> eeh_init();
>> eeh_addr_cache_build();
>> #endif
>> -
>> - /* Link NPU IODA tables to their PCI devices. */
>> - pnv_npu_ioda_fixup();
>> }
>>
>> /*
>> diff --git a/arch/powerpc/platforms/powernv/pci.h
> b/arch/powerpc/platforms/powernv/pci.h
>> index 485e5b1..e117bd8 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -24,7 +24,6 @@ enum pnv_phb_model {
>> #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case
> */
>> #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case
> */
>> #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF
> */
>> -#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers
> */
>>
>> /* Data associated with a PE, including IOMMU tracking etc.. */
>> struct pnv_phb;
>> @@ -32,9 +31,6 @@ struct pnv_ioda_pe {
>> unsigned long flags;
>> struct pnv_phb *phb;
>>
>> -#define PNV_IODA_MAX_PEER_PES 8
>> - struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES];
>> -
>> /* A PE can be associated with a single device or an
>> * entire bus (& children). In the former case, pdev
>> * is populated, in the later case, pbus is.
>> @@ -246,8 +242,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe
> *pe, const char *level,
>> pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
>>
>> /* Nvlink functions */
>> -extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
>> -extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
>> extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>> extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool
> rm);
>>
>>
>
--
Alexey
More information about the Linuxppc-dev
mailing list