[PATCH v6 23/42] powerpc/powernv: Release PEs dynamically
Alexey Kardashevskiy
aik at ozlabs.ru
Tue Aug 11 23:03:40 AEST 2015
On 08/06/2015 02:11 PM, Gavin Shan wrote:
> This adds the refcount to PE, which represents number of PCI
> devices contained in the PE. When last device leaves from the
> PE, the PE together with its consumed resources (IO, DMA, PELTM,
> PELTV) are released, to support PCI hotplug.
>
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---
> arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++++++++++++++++++++++++++---
> arch/powerpc/platforms/powernv/pci.h | 3 +
> 2 files changed, 217 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index d2697a3..13d8a5b 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -132,6 +132,53 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
> }
>
> +static void pnv_pci_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
Is this an IODA1 helper or a common helper for both IODA1 and IODA2?
> +{
> + struct pnv_phb *phb = pe->phb;
> + struct iommu_table *tbl;
> + int seg;
> + int64_t rc;
> +
> + /* No DMA32 segments allocated */
> + if (pe->dma32_seg == PNV_INVALID_SEGMENT ||
> + pe->dma32_segcount <= 0) {
dma32_segcount is unsigned long, cannot be less than 0.
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> + pe->dma32_segcount = 0;
> + return;
> + }
> +
> + /* Unlink IOMMU table from group */
> + tbl = pe->table_group.tables[0];
> + pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
> + if (pe->table_group.group) {
> + iommu_group_put(pe->table_group.group);
> + BUG_ON(pe->table_group.group);
> + }
> +
> + /* Release IOMMU table */
> + free_pages(tbl->it_base,
> + get_order(TCE32_TABLE_SIZE * pe->dma32_segcount));
> + iommu_free_table(tbl,
> + of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
There is pnv_pci_ioda2_table_free_pages(), use it.
> +
> + /* Disable TVE */
> + for (seg = pe->dma32_seg;
> + seg < pe->dma32_seg + pe->dma32_segcount;
> + seg++) {
> + rc = opal_pci_map_pe_dma_window(phb->opal_id,
> + pe->pe_number, seg, 0, 0ul, 0ul, 0ul);
> + if (rc)
> + pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
> + rc, seg);
> + }
Maybe implement iommu_table_group_ops::unset_window for IODA1 too?
> +
> + /* Free the DMA32 segments */
> + bitmap_clear(phb->ioda.dma32_segmap,
> + pe->dma32_seg, pe->dma32_segcount);
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> + pe->dma32_segcount = 0;
> +}
> +
> static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
> {
> /* 01xb - invalidate TCEs that match the specified PE# */
> @@ -199,13 +246,15 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
> pe->tce_bypass_enabled = enable;
> }
>
> -#ifdef CONFIG_PCI_IOV
> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
> - struct pnv_ioda_pe *pe)
> +static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
> {
> struct iommu_table *tbl;
> + struct device_node *dn;
> int64_t rc;
>
> + if (pe->dma32_seg == PNV_INVALID_SEGMENT)
> + return;
> +
> tbl = pe->table_group.tables[0];
> rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> if (rc)
> @@ -216,10 +265,91 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
> iommu_group_put(pe->table_group.group);
> BUG_ON(pe->table_group.group);
> }
> +
> + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> + dn = pci_bus_to_OF_node(pe->pbus);
> + else if (pe->flags & PNV_IODA_PE_DEV)
> + dn = pci_device_to_OF_node(pe->pdev);
> +#ifdef CONFIG_PCI_IOV
> + else if (pe->flags & PNV_IODA_PE_VF)
> + dn = pci_device_to_OF_node(pe->parent_dev);
> +#endif
> + else
> + dn = NULL;
> +
> pnv_pci_ioda2_table_free_pages(tbl);
> - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> + iommu_free_table(tbl, of_node_full_name(dn));
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> +}
I'd drop the chunk about calculating @dn above, nobody really cares what
iommu_free_table() prints. If you really need to print something, print PE#.
> +
> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
> +{
> + struct pnv_phb *phb = pe->phb;
> +
> + switch (phb->type) {
> + case PNV_PHB_IODA1:
> + pnv_pci_ioda_release_pe_dma(pe);
> + break;
> + case PNV_PHB_IODA2:
> + pnv_pci_ioda2_release_pe_dma(pe);
> + break;
> + default:
> + pr_warn("%s: Cannot release DMA for PHB type %d\n",
> + __func__, phb->type);
This should really be a BUG_ON(), because we cannot possibly get this far
with an unsupported PHB type; it would have crashed earlier.
> + }
> +}
> +
> +static void pnv_ioda_release_pe_one_seg(struct pnv_ioda_pe *pe, int win)
> +{
> + struct pnv_phb *phb = pe->phb;
> + unsigned long *segmap = NULL;
> + unsigned long *pe_segmap = NULL;
> + int segno, limit, mod = 0;
> +
> + switch (win) {
> + case OPAL_IO_WINDOW_TYPE:
> + segmap = phb->ioda.io_segmap;
> + pe_segmap = pe->io_segmap;
> + break;
> + case OPAL_M32_WINDOW_TYPE:
> + segmap = phb->ioda.m32_segmap;
> + pe_segmap = pe->m32_segmap;
> + break;
> + case OPAL_M64_WINDOW_TYPE:
> + if (phb->type != PNV_PHB_IODA1)
> + return;
> + segmap = phb->ioda.m64_segmap;
> + pe_segmap = pe->m64_segmap;
You seem to keep phb->ioda.m64_segmap updated, but you never actually read
it; you only read pe->m64_segmap. Is that correct, or am I missing something
here?
> + mod = 8;
> + break;
> + default:
> + return;
> + }
> +
> + segno = -1;
> + limit = phb->ioda.total_pe_num;
> + while ((segno = find_next_bit(pe_segmap, limit, segno + 1)) < limit) {
> + if (mod > 0)
> + opal_pci_map_pe_mmio_window(phb->opal_id,
> + phb->ioda.reserved_pe_idx, win,
> + segno / mod, segno % mod);
> + else
> + opal_pci_map_pe_mmio_window(phb->opal_id,
> + phb->ioda.reserved_pe_idx, win,
> + 0, segno);
> +
> + clear_bit(segno, pe_segmap);
> + clear_bit(segno, segmap);
> + }
> +}
> +
> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
> +{
> + int win;
> +
> + for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++)
> + pnv_ioda_release_pe_one_seg(pe, win);
> }
> -#endif /* CONFIG_PCI_IOV */
>
> static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
> struct pnv_ioda_pe *parent,
> @@ -325,7 +455,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
> return 0;
> }
>
> -#ifdef CONFIG_PCI_IOV
> static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> {
> struct pci_dev *parent;
> @@ -373,9 +502,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> }
> rid_end = pe->rid + (count << 8);
> } else {
> +#ifdef CONFIG_PCI_IOV
> if (pe->flags & PNV_IODA_PE_VF)
> parent = pe->parent_dev;
> else
> +#endif
> parent = pe->pdev->bus->self;
> bcomp = OpalPciBusAll;
> dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> @@ -415,11 +546,72 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>
> pe->pbus = NULL;
> pe->pdev = NULL;
> +#ifdef CONFIG_PCI_IOV
> pe->parent_dev = NULL;
> +#endif
>
> return 0;
> }
> -#endif /* CONFIG_PCI_IOV */
> +
> +static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
> +{
> + struct pnv_phb *phb = pe->phb;
> + struct pnv_ioda_pe *tmp, *slave;
> +
> + /* Release slave PEs in compound PE */
> + if (pe->flags & PNV_IODA_PE_MASTER) {
> + list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
> + pnv_ioda_release_pe(pe);
> + }
> +
> + /* Remove the PE from the list */
> + list_del(&pe->list);
> +
> + /* Release resources */
> + pnv_ioda_release_pe_dma(pe);
> + pnv_ioda_release_pe_seg(pe);
> + pnv_ioda_deconfigure_pe(pe->phb, pe);
> +
> + /* Release PE number */
> + clear_bit(pe->pe_number, phb->ioda.pe_alloc);
> +}
> +
> +static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
> +{
> + if (!pe)
> + return NULL;
> +
> + pe->device_count++;
> + return pe;
> +}
> +
> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
> +{
> + if (!pe)
> + return;
> +
> + pe->device_count--;
> + BUG_ON(pe->device_count < 0);
> + if (pe->device_count == 0)
> + pnv_ioda_release_pe(pe);
> +}
Are you sure you do not want atomic_t for device_count? Are races impossible here?
> +
> +static void pnv_pci_release_device(struct pci_dev *pdev)
> +{
> + struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> + struct pnv_phb *phb = hose->private_data;
> + struct pci_dn *pdn = pci_get_pdn(pdev);
> + struct pnv_ioda_pe *pe;
> +
> + if (pdev->is_virtfn)
> + return;
> +
> + if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> + return;
> +
> + pe = &phb->ioda.pe_array[pdn->pe_number];
> + pnv_ioda_pe_put(pe);
> +}
>
> static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
> {
> @@ -466,6 +658,7 @@ static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
> return pnv_ioda_init_pe(phb, pe);
> }
>
> +#ifdef CONFIG_PCI_IOV
> static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
The name pnv_ioda_free_pe() suggests it should work for the non-SRIOV case
too, but you put it under #ifdef CONFIG_PCI_IOV; is that correct? If so, please rename it.
> {
> WARN_ON(phb->ioda.pe_array[pe].pdev);
> @@ -473,6 +666,7 @@ static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
> memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
> clear_bit(pe, phb->ioda.pe_alloc);
> }
> +#endif
>
> static int pnv_ioda1_init_m64(struct pnv_phb *phb)
> {
> @@ -1177,6 +1371,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
> if (pdn->pe_number != IODA_INVALID_PE)
> continue;
>
> + pnv_ioda_pe_get(pe);
> pdn->pe_number = pe->pe_number;
> pe->dma32_weight += pnv_ioda_dma_weight(dev);
> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> @@ -1231,7 +1426,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
> pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
> pe->pbus = bus;
> pe->pdev = NULL;
> - pe->dma32_seg = -1;
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> pe->mve_number = -1;
> pe->rid = bus->busn_res.start << 8;
> pe->dma32_weight = 0;
> @@ -1244,9 +1439,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
> bus->busn_res.start, pe->pe_number);
>
> if (pnv_ioda_configure_pe(phb, pe)) {
> - /* XXX What do we do here ? */
> - pnv_ioda_free_pe(phb, pe->pe_number);
> pe->pbus = NULL;
> + pnv_ioda_release_pe(pe);
> return NULL;
> }
>
> @@ -1449,14 +1643,14 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> if ((pe->flags & PNV_IODA_PE_MASTER) &&
> (pe->flags & PNV_IODA_PE_VF)) {
> list_for_each_entry_safe(s, sn, &pe->slaves, list) {
> - pnv_pci_ioda2_release_dma_pe(pdev, s);
> + pnv_pci_ioda2_release_dma_pe(s);
> list_del(&s->list);
> pnv_ioda_deconfigure_pe(phb, s);
> pnv_ioda_free_pe(phb, s->pe_number);
> }
> }
>
> - pnv_pci_ioda2_release_dma_pe(pdev, pe);
> + pnv_pci_ioda2_release_pe_dma(pe);
>
> /* Remove from list */
> mutex_lock(&phb->ioda.pe_list_mutex);
> @@ -1532,7 +1726,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> pe->flags = PNV_IODA_PE_VF;
> pe->pbus = NULL;
> pe->parent_dev = pdev;
> - pe->dma32_seg = -1;
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
This and similar changes are not really about "Release PEs dynamically".
> pe->mve_number = -1;
> pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
> pci_iov_virtfn_devfn(pdev, vf_index);
> @@ -1995,7 +2189,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> /* XXX FIXME: Allocate multi-level tables on PHB3 */
>
> /* We shouldn't already have a 32-bit DMA associated */
> - if (WARN_ON(pe->dma32_seg >= 0))
> + if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
> return;
>
> tbl = pnv_pci_table_alloc(phb->hose->node);
> @@ -2066,10 +2260,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> return;
> fail:
> /* XXX Failure: Try to fallback to 64-bit only ? */
> - if (pe->dma32_seg >= 0) {
> + if (pe->dma32_seg != PNV_INVALID_SEGMENT) {
> bitmap_clear(phb->ioda.dma32_segmap,
> pe->dma32_seg, pe->dma32_segcount);
> - pe->dma32_seg = -1;
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> pe->dma32_segcount = 0;
> }
>
> @@ -2416,7 +2610,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> int64_t rc;
>
> /* We shouldn't already have a 32-bit DMA associated */
> - if (WARN_ON(pe->dma32_seg >= 0))
> + if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
> return;
>
> /* TVE #1 is selected by PCI address bit 59 */
> @@ -2443,8 +2637,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>
> rc = pnv_pci_ioda2_setup_default_config(pe);
> if (rc) {
> - if (pe->dma32_seg >= 0)
> - pe->dma32_seg = -1;
> + if (pe->dma32_seg != PNV_INVALID_SEGMENT)
> + pe->dma32_seg = PNV_INVALID_SEGMENT;
> return;
> }
>
> @@ -3183,6 +3377,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
> .teardown_msi_irqs = pnv_teardown_msi_irqs,
> #endif
> .enable_device_hook = pnv_pci_enable_device_hook,
> + .release_device = pnv_pci_release_device,
> .window_alignment = pnv_pci_window_alignment,
> .setup_bridge = pnv_pci_setup_bridge,
> .reset_secondary_bus = pnv_pci_reset_secondary_bus,
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index f8e6022..2058f06 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -25,11 +25,14 @@ enum pnv_phb_model {
> #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
> #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
>
> +#define PNV_INVALID_SEGMENT (-1)
> +
> /* Data associated with a PE, including IOMMU tracking etc.. */
> struct pnv_phb;
> struct pnv_ioda_pe {
> unsigned long flags;
> struct pnv_phb *phb;
> + int device_count;
>
> /* A PE can be associated with a single device or an
> * entire bus (& children). In the former case, pdev
>
--
Alexey
More information about the Linuxppc-dev
mailing list