[Very RFC 07/46] powernv/pci: Rework IODA PE device accounting

Alexey Kardashevskiy aik at ozlabs.ru
Thu Nov 21 16:48:08 AEDT 2019



On 20/11/2019 12:28, Oliver O'Halloran wrote:
> The current process for configuring the IODA PEs for normal PCI devices is
> abit stupid.

This phrase should go.


> After assigning MMIO resources for the devices on a bus the
> actual PE asignment occurs when pcibios_setup_bridge() is called for the
> parent bridge. In pcibios_setup_bridge() we:
> 
> 1. Assign all 256 devfn's for the subordinate bus to the PE corresponding
>    to the bridge window.
> 2. Initialise the IOMMU tables for that PE.
> 3. Traverse each device on the bus below the bridge setting the IOMMU table
>    pointer to point at the PE's table.
> 4. Finally, set pci_dn->pe_number to indicate that we've done the
>    per-device setup and allow EEH and the platform code to look up
>    the PE number.
> 
> Although mostly functional, there's a couple of issues with this approach.
> The most glaring is that it mixes the per-bus (per-PE really) setup with
> the per-device setup in a way that's completely asymmetric to what happens
> when tearing down a device.
> 
> In step 4. the number of devices in the PE is counted and stored in the
> ioda_pe structure. When releasing a pci_dev the device count is dropped
> until it hits zero where the ioda_pe itself is torn down. However, the bus
> itself remains active and can be re-scanned to bring back the devices that
> were removed. On a rescan we do not re-run the bridge setup so the
> per-device setup is never re-done which results in the re-scanned being
> unusable.
> 
> There are a few other minor issues too. Associating all 256 devfns with
> the PE means that config accesses to non-existant PCI devices results
> in a spurious PE freezes. We currently prevent this by only allowing config
> accesses to a BDFN when there is a corresponding pci_dn structure. We
> would like to eliminate that restriction in the future though.
> 
> That all said the biggest issue is that the current behaviour is hard to
> follow at the best of times. On top of that the behaviour is slightly (or
> majorly) different across each PHB type (PCIe, OpenCAPI, NVLink) and the
> behaviour for physical devices (described above) and virtual functions is
> again different. To address this we want to merge the paths as much as
> possible so that the PowerNV specific PCI initialisation steps all occur
> at roughly the same point in the PCI device setup path.
> 
> We can address most of these problems by moving the PE setup out of
> pcibios_setup_bridge() and into pcibios_bus_add_device(). The latter is
> called on a per-device basis so we have some symmetry between the setup and
> teardown paths. Moving the PE assignments to here should also allow us to
> converge how PE assignment works on all PHB types so it's always done in
> one place.
> 
> Signed-off-by: Oliver O'Halloran <oohall at gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 112 +++++++++++-----------
>  1 file changed, 58 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c6ea7a504e04..c74521e5f3ab 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
>  					      "NPU_OCAPI" };
>  
>  static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +static void pnv_pci_configure_bus(struct pci_bus *bus);


Do not need this, at least, after 46/46.

>  
>  void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>  			    const char *fmt, ...)
> @@ -1104,34 +1105,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  	return pe;
>  }
>  
> -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
> -{
> -	struct pci_dev *dev;
> -
> -	list_for_each_entry(dev, &bus->devices, bus_list) {
> -		struct pci_dn *pdn = pci_get_pdn(dev);
> -
> -		if (pdn == NULL) {
> -			pr_warn("%s: No device node associated with device !\n",
> -				pci_name(dev));
> -			continue;
> -		}
> -
> -		/*
> -		 * In partial hotplug case, the PCI device might be still
> -		 * associated with the PE and needn't attach it to the PE
> -		 * again.
> -		 */
> -		if (pdn->pe_number != IODA_INVALID_PE)
> -			continue;
> -
> -		pe->device_count++;
> -		pdn->pe_number = pe->pe_number;
> -		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> -			pnv_ioda_setup_same_PE(dev->subordinate, pe);
> -	}
> -}
> -
>  /*
>   * There're 2 types of PCI bus sensitive PEs: One that is compromised of
>   * single PCI bus. Another one that contains the primary PCI bus and its
> @@ -1152,7 +1125,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  	pe_num = phb->ioda.pe_rmap[bus->number << 8];
>  	if (pe_num != IODA_INVALID_PE) {
>  		pe = &phb->ioda.pe_array[pe_num];
> -		pnv_ioda_setup_same_PE(bus, pe);
>  		return NULL;
>  	}
>  
> @@ -1196,9 +1168,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  		return NULL;
>  	}
>  
> -	/* Associate it with all child devices */
> -	pnv_ioda_setup_same_PE(bus, pe);
> -
>  	/* Put PE to the list */
>  	list_add_tail(&pe->list, &phb->ioda.pe_list);
>  
> @@ -1758,23 +1727,20 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	struct pnv_ioda_pe *pe;
>  
> -	/*
> -	 * The function can be called while the PE#
> -	 * hasn't been assigned. Do nothing for the
> -	 * case.
> -	 */
> -	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> -		return;
> -
>  	pe = &phb->ioda.pe_array[pdn->pe_number];
>  	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>  	pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
>  	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
> +
> +	pe->device_count++;
> +
>  	/*
>  	 * Note: iommu_add_device() will fail here as
>  	 * for physical PE: the device is already added by now;
>  	 * for virtual PE: sysfs entries are not ready yet and
>  	 * tce_iommu_bus_notifier will add the device to a group later.
> +	 *
> +	 * XXX: this is wrong since the re-ordering patch.


What is that re-ordering patch and why it did not remove this comment then?



>  	 */
>  }
>  
> @@ -2288,9 +2254,6 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
>  	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
>  	iommu_init_table(tbl, phb->hose->node, 0, 0);
>  
> -	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> -		pnv_ioda_setup_bus_dma(pe, pe->pbus);
> -
>  	return;
>   fail:
>  	/* XXX Failure: Try to fallback to 64-bit only ? */
> @@ -2626,9 +2589,9 @@ static void pnv_pci_ioda_setup_iommu_api(void)
>  	/*
>  	 * There are 4 types of PEs:
>  	 * - PNV_IODA_PE_BUS: a downstream port with an adapter,
> -	 *   created from pnv_pci_setup_bridge();
> +	 *   created from pnv_pci_configure_bus();
>  	 * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
> -	 *   created from pnv_pci_setup_bridge();
> +	 *   created from pnv_pci_configure_bus();
>  	 * - PNV_IODA_PE_VF: a SRIOV virtual function,
>  	 *   created from pnv_pcibios_sriov_enable();
>  	 * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
> @@ -2748,8 +2711,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	if (rc)
>  		return;
>  
> -	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> -		pnv_ioda_setup_bus_dma(pe, pe->pbus);
> +	/*
> +	 * The IOMMU table for the PE is associated with the device in
> +	 * pnv_pcibios_bus_add_device()
> +	 */


Is it really? eeh_add_device_late or pnv_eeh_probe_pdev do not seem to
be doing this.


>  }
>  
>  int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
> @@ -3324,16 +3289,13 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
>  	}
>  }
>  
> -static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
> +static void pnv_pci_configure_bus(struct pci_bus *bus)
>  {
>  	struct pci_controller *hose = pci_bus_to_host(bus);
>  	struct pnv_phb *phb = hose->private_data;
>  	struct pci_dev *bridge = bus->self;
>  	struct pnv_ioda_pe *pe;
> -	bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
> -
> -	/* Extend bridge's windows if necessary */
> -	pnv_pci_fixup_bridge_resources(bus, type);
> +	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
>  
>  	/* The PE for root bus should be realized before any one else */
>  	if (!phb->ioda.root_pe_populated) {
> @@ -3342,12 +3304,21 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
>  			phb->ioda.root_pe_idx = pe->pe_number;
>  			phb->ioda.root_pe_populated = true;
>  		}
> +
> +		/* no need to re-configure the root bus */
> +		if (bus == phb->hose->bus)
> +			return;
>  	}
>  
>  	/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
>  	if (list_empty(&bus->devices))
>  		return;
>  
> +	/* PE should never be re-configured */
> +	pe = __pnv_ioda_get_pe(phb, bus->number << 8);
> +	if (WARN_ON(pe))
> +		return;
> +
>  	/* Reserve PEs according to used M64 resources */
>  	pnv_ioda_reserve_m64_pe(bus, NULL, all);
>  
> @@ -3654,6 +3625,39 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
>  {
>  	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>  	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +	struct pnv_ioda_pe *pe;
> +
> +	/* Check if the BDFN for this device is associated with a PE yet */
> +	pe = __pnv_ioda_get_pe(phb, pdev->devfn | (pdev->bus->number << 8));
> +	if (!pe) {
> +		/*
> +		 * We should only hit this path for "normal" PCI PHBs. The


We should only hit this path only for PCIe-PCIx bridges, right? If yes,
should not this be in pnv_pci_dma_bus_setup()? If no, I am lost :(


> +		 * special PHBs used for OpenCAPI and NVLink don't have to
> +		 * deal with eeh-on-mmio so they assign PEs at probe time
> +		 * rather than after resources are allocated.
> +		 */
> +		WARN_ON(phb->type != PNV_PHB_IODA2 && phb->type != PNV_PHB_IODA1);
> +		/* PEs for VFs should have been assigned in sriov_enable() */
> +		WARN_ON(pdev->is_virtfn);
> +
> +		pnv_pci_configure_bus(pdev->bus);
> +		pe = __pnv_ioda_get_pe(phb, pdev->devfn | (pdev->bus->number << 8));
> +		pci_err(pdev, "Configured new pe PE#%x\n", pe ? pe->pe_number : 0xfffff);
> +
> +


An extra empty line.

> +		/*
> +		 * If we can't setup the IODA PE something has gone horribly
> +		 * wrong and we can't enable DMA for the device.
> +		 */
> +		if (WARN_ON(!pe))
> +			return;
> +	} else {
> +		pci_err(pdev, "Added to existing PE#%x\n", pe->pe_number);
> +	}
> +
> +	if (pdn)
> +		pdn->pe_number = pe->pe_number;
>  
>  	pnv_pci_ioda_dma_dev_setup(phb, pdev);

Open code pnv_pci_ioda_dma_dev_setup()?

Or at least do pe->device_count++ here?


>  }
> @@ -3680,14 +3684,14 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
>  
>  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>  	.dma_dev_setup		= pnv_pci_dma_dev_setup,
> -	.dma_bus_setup		= pnv_pci_dma_bus_setup,
> +	.dma_bus_setup		= pnv_pci_dma_bus_setup, /* NB: DMA setup actually happens in dma_dev_setup */

The comment does not help much imho.


>  	.iommu_bypass_supported	= pnv_pci_ioda_iommu_bypass_supported,
>  	.setup_msi_irqs		= pnv_setup_msi_irqs,
>  	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
>  	.enable_device_hook	= pnv_pci_enable_device_hook,
>  	.release_device		= pnv_pci_release_device,
>  	.window_alignment	= pnv_pci_window_alignment,
> -	.setup_bridge		= pnv_pci_setup_bridge,
> +	.setup_bridge		= pnv_pci_fixup_bridge_resources,


Using "fixup" in the name of a "setup" hook is ... mmmm.... contemporary :)



>  	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
>  	.shutdown		= pnv_pci_ioda_shutdown,
>  };
> 

-- 
Alexey


More information about the Linuxppc-dev mailing list