[PATCH kernel v3 18/22] powerpc/powernv/npu: Add compound IOMMU groups
Alexey Kardashevskiy
aik at ozlabs.ru
Mon Nov 19 13:29:07 AEDT 2018
On 19/11/2018 12:12, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:28:19PM +1100, Alexey Kardashevskiy wrote:
>> At the moment powernv registers an IOMMU group for each PE. There is
>> an exception though - NPU (an emulated PCI bridge representing an NVLink);
>> powernv attaches these bridges to the GPU IOMMU group which becomes
>> a master.
>>
>> Now we have POWER9 systems with GPUs connected to each other directly,
>> bypassing PCI. At the moment powernv does not control these links, so
>> it has to put such interconnected GPUs into the same IOMMU group, which
>> means that the old scheme with a GPU as a master won't work - there will
>> be up to 3 GPUs in such a group.
>>
>> This introduces an npu_comp struct which represents a compound IOMMU
>> group made of multiple PEs. This converts the existing NVLink1 code to
>> use the new scheme. From now on, each PE must have a valid
>> iommu_table_group_ops which will either be called directly (for a
>> single-PE group) or indirectly from a compound group.
>>
>> This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c.
>> For POWER8, this stores a new compound group pointer in a PE (so a GPU
>> is still a master); for POWER9 the new group pointer is stored in an NPU.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> ---
>> arch/powerpc/include/asm/pci.h | 1 +
>> arch/powerpc/platforms/powernv/pci.h | 7 +
>> arch/powerpc/platforms/powernv/npu-dma.c | 286 ++++++++++++++++++++--
>> arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++----------
>> 4 files changed, 308 insertions(+), 159 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
>> index baf2886..0c72f18 100644
>> --- a/arch/powerpc/include/asm/pci.h
>> +++ b/arch/powerpc/include/asm/pci.h
>> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
>> extern int pnv_npu2_init(struct pci_controller *hose);
>> extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
>> unsigned long msr);
>> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
>>
>> #endif /* __ASM_POWERPC_PCI_H */
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index cf9f748..aef4bb5 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -62,6 +62,7 @@ struct pnv_ioda_pe {
>>
>> /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>> struct iommu_table_group table_group;
>> + struct npu_comp *npucomp;
>>
>> /* 64-bit TCE bypass region */
>> bool tce_bypass_enabled;
>> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>> extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
>> extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
>> extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>> + __u64 window_size, __u32 levels);
>> extern int pnv_eeh_post_init(void);
>>
>> extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>> extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>> extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
>> extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
>> +extern struct iommu_table_group *pnv_try_setup_npu_table_group(
>> + struct pnv_ioda_pe *pe);
>> +extern struct iommu_table_group *pnv_npu_compound_attach(
>> + struct pnv_ioda_pe *pe);
>>
>> /* pci-ioda-tce.c */
>> #define POWERNV_IOMMU_DEFAULT_LEVELS 1
>> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
>> index 1792c7e..2231f4c 100644
>> --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
>> .unset_window = pnv_npu_unset_window,
>> .take_ownership = pnv_npu_take_ownership,
>> };
>> -
>> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>> -{
>> - struct pnv_phb *phb = npe->phb;
>> - struct pci_bus *pbus = phb->hose->bus;
>> - struct pci_dev *npdev, *gpdev = NULL, *gptmp;
>> - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
>> -
>> - if (!gpe || !gpdev)
>> - return NULL;
>> -
>> - npe->table_group.ops = &pnv_pci_npu_ops;
>> -
>> - list_for_each_entry(npdev, &pbus->devices, bus_list) {
>> - gptmp = pnv_pci_get_gpu_dev(npdev);
>> -
>> - if (gptmp != gpdev)
>> - continue;
>> -
>> - pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
>> - iommu_group_add_device(gpe->table_group.group, &npdev->dev);
>> - }
>> -
>> - return gpe;
>> -}
>> #endif /* !CONFIG_IOMMU_API */
>>
>> /*
>> @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>> */
>> /* Maximum possible number of ATSD MMIO registers per NPU */
>> #define NV_NMMU_ATSD_REGS 8
>> +#define NV_NPU_MAX_PE_NUM 16
>> +
>> +/*
>> + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
>> + * up to 3 x (GPU + 2xNPUs) (POWER9).
>> + */
>> +struct npu_comp {
>> + struct iommu_table_group table_group;
>> + int pe_num;
>> + struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
>> +};
>>
>> /* An NPU descriptor, valid for POWER9 only */
>> struct npu {
>> @@ -365,6 +351,8 @@ struct npu {
>> struct list_head next;
>>
>> struct pci_controller *hose;
>> +
>> + struct npu_comp npucomp;
>> };
>
> I'm confused by this. The comment implies there are multiple NPUs in a
> single composite group, but the npu_comp structure is embedded in the
> npu structure, implying there's a copy per-NPU.
Yeah, there is a naming confusion. The NPU is a big chunk in the CPU with 6
links, and this is what the "struct npu" above describes.
There are also 6 emulated NPU bridge devices, which you can see in lspci
with the "ibmnpu" driver bound to them.
I guess from now on I will refer to the big NPU as "NPU" and to the
emulated bridge device as "NVLink2" or "NVLink2 emulated device", unless
you have a better suggestion (Alistair does not, though).
--
Alexey
More information about the Linuxppc-dev
mailing list