[PATCH kernel v3 18/22] powerpc/powernv/npu: Add compound IOMMU groups

Mon Nov 19 13:29:07 AEDT 2018

On 19/11/2018 12:12, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:28:19PM +1100, Alexey Kardashevskiy wrote:
>> At the moment powernv registers an IOMMU group for each PE. There is
>> an exception though - NPU (an emulated PCI bridge representing an NVLink);
>> powernv attaches these bridges to the GPU IOMMU group which becomes
>> a master.
>>
>> Now we have POWER9 systems with GPUs connected to each other directly,
>> bypassing PCI. At the moment powernv does not control these links so
>> it has to put such interconnected GPUs to the same IOMMU group which
>> means that the old scheme with a GPU as a master won't work - there will
>> be up to 3 GPUs in such group.
>>
>> This introduces a npu_comp struct which represents a compound IOMMU
>> group made of multiple PEs. This converts the existing NVLink1 code to
>> use the new scheme. From now on, each PE must have a valid
>> iommu_table_group_ops which will either be called directly (a single PE
>> group) or indirectly from a compound group.
>>
>> This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c.
>> For POWER8, this stores a new compound group pointer in a PE (so a GPU
>> is still a master); for POWER9 the new group pointer is stored in an NPU.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> ---
>>  arch/powerpc/include/asm/pci.h            |   1 +
>>  arch/powerpc/platforms/powernv/pci.h      |   7 +
>>  arch/powerpc/platforms/powernv/npu-dma.c  | 286 ++++++++++++++++++++--
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++----------
>>  4 files changed, 308 insertions(+), 159 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
>> index baf2886..0c72f18 100644
>> --- a/arch/powerpc/include/asm/pci.h
>> +++ b/arch/powerpc/include/asm/pci.h
>> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
>>  extern int pnv_npu2_init(struct pci_controller *hose);
>>  extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
>>  		unsigned long msr);
>> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
>>  
>>  #endif /* __ASM_POWERPC_PCI_H */
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index cf9f748..aef4bb5 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -62,6 +62,7 @@ struct pnv_ioda_pe {
>>  
>>  	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>>  	struct iommu_table_group table_group;
>> +	struct npu_comp		*npucomp;
>>  
>>  	/* 64-bit TCE bypass region */
>>  	bool			tce_bypass_enabled;
>> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>>  extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
>>  extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
>>  extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>> +		__u64 window_size, __u32 levels);
>>  extern int pnv_eeh_post_init(void);
>>  
>>  extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>  extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>>  extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
>>  extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
>> +extern struct iommu_table_group *pnv_try_setup_npu_table_group(
>> +		struct pnv_ioda_pe *pe);
>> +extern struct iommu_table_group *pnv_npu_compound_attach(
>> +		struct pnv_ioda_pe *pe);
>>  
>>  /* pci-ioda-tce.c */
>>  #define POWERNV_IOMMU_DEFAULT_LEVELS	1
>> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
>> index 1792c7e..2231f4c 100644
>> --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
>>  	.unset_window = pnv_npu_unset_window,
>>  	.take_ownership = pnv_npu_take_ownership,
>>  };
>> -
>> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>> -{
>> -	struct pnv_phb *phb = npe->phb;
>> -	struct pci_bus *pbus = phb->hose->bus;
>> -	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
>> -	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
>> -
>> -	if (!gpe || !gpdev)
>> -		return NULL;
>> -
>> -	npe->table_group.ops = &pnv_pci_npu_ops;
>> -
>> -	list_for_each_entry(npdev, &pbus->devices, bus_list) {
>> -		gptmp = pnv_pci_get_gpu_dev(npdev);
>> -
>> -		if (gptmp != gpdev)
>> -			continue;
>> -
>> -		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
>> -		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
>> -	}
>> -
>> -	return gpe;
>> -}
>>  #endif /* !CONFIG_IOMMU_API */
>>  
>>  /*
>> @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
>>   */
>>  /* Maximum possible number of ATSD MMIO registers per NPU */
>>  #define NV_NMMU_ATSD_REGS 8
>> +#define NV_NPU_MAX_PE_NUM	16
>> +
>> +/*
>> + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
>> + * up to 3 x (GPU + 2xNPUs) (POWER9).
>> + */
>> +struct npu_comp {
>> +	struct iommu_table_group table_group;
>> +	int pe_num;
>> +	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
>> +};
>>  
>>  /* An NPU descriptor, valid for POWER9 only */
>>  struct npu {
>> @@ -365,6 +351,8 @@ struct npu {
>>  	struct list_head next;
>>  
>>  	struct pci_controller *hose;
>> +
>> +	struct npu_comp npucomp;
>>  };
> 
> I'm confused by this.  The comment simply there are multiple NPUs in a
> single composite-group, but the np_comp structure is embedded in the
> npu structure, implying there's a copy per-NPU.

Yeah, there is a naming confusion. NPU is a big chunk in the CPU with 6
links, and this is what the "struct npu" above describes.

And there are 6 NPU emulated bridge devices which you can see in lspci
with the "ibmnpu" driver bound to them.

I guess from now on I will refer to the big NPU as "NPU" and to the
emulated bridge device as "NVLink2" or "NVlink2 emulated device" unless
you have a better suggestion (Alistair does not though).

-- 
Alexey