[PATCH v6 23/42] powerpc/powernv: Release PEs dynamically

Gavin Shan gwshan at linux.vnet.ibm.com
Thu Aug 13 10:54:36 AEST 2015


On Tue, Aug 11, 2015 at 11:03:40PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 02:11 PM, Gavin Shan wrote:
>>This adds the refcount to PE, which represents number of PCI
>>devices contained in the PE. When last device leaves from the
>>PE, the PE together with its consumed resources (IO, DMA, PELTM,
>>PELTV) are released, to support PCI hotplug.
>>
>>Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
>>---
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++++++++++++++++++++++++++---
>>  arch/powerpc/platforms/powernv/pci.h      |   3 +
>>  2 files changed, 217 insertions(+), 19 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index d2697a3..13d8a5b 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -132,6 +132,53 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>  }
>>
>>+static void pnv_pci_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>
>Is this ioda1 helper or common helper for both ioda1 and ioda2?
>

It's for IODA1 only.

>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct iommu_table *tbl;
>>+	int seg;
>>+	int64_t rc;
>>+
>>+	/* No DMA32 segments allocated */
>>+	if (pe->dma32_seg == PNV_INVALID_SEGMENT ||
>>+	    pe->dma32_segcount <= 0) {
>
>
>dma32_segcount is unsigned long, cannot be less than 0.
>

It's declared as "int dma32_segcount" in pci.h, so the check is valid.

>>+		pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+		pe->dma32_segcount = 0;
>>+		return;
>>+	}
>>+
>>+	/* Unlink IOMMU table from group */
>>+	tbl = pe->table_group.tables[0];
>>+	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>>+	if (pe->table_group.group) {
>>+		iommu_group_put(pe->table_group.group);
>>+		BUG_ON(pe->table_group.group);
>>+	}
>>+
>>+	/* Release IOMMU table */
>>+	free_pages(tbl->it_base,
>>+		get_order(TCE32_TABLE_SIZE * pe->dma32_segcount));
>>+	iommu_free_table(tbl,
>>+		of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
>
>There is pnv_pci_ioda2_table_free_pages(), use it.
>

The function (pnv_pci_ioda_release_pe_dma()) is for IODA1 only.

>>+
>>+	/* Disable TVE */
>>+	for (seg = pe->dma32_seg;
>>+	     seg < pe->dma32_seg + pe->dma32_segcount;
>>+	     seg++) {
>>+		rc = opal_pci_map_pe_dma_window(phb->opal_id,
>>+				pe->pe_number, seg, 0, 0ul, 0ul, 0ul);
>>+		if (rc)
>>+			pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
>>+				rc, seg);
>>+	}
>
>May be implement iommu_table_group_ops::unset_window for IODA1 too?
>

Good point, but it's out of scope for this patch. I'm putting it on my TODO
list and will cook up a patch when I have the chance.

>>+
>>+	/* Free the DMA32 segments */
>>+	bitmap_clear(phb->ioda.dma32_segmap,
>>+		pe->dma32_seg, pe->dma32_segcount);
>>+	pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+	pe->dma32_segcount = 0;
>>+}
>>+
>>  static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
>>  {
>>  	/* 01xb - invalidate TCEs that match the specified PE# */
>>@@ -199,13 +246,15 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>>  		pe->tce_bypass_enabled = enable;
>>  }
>>
>>-#ifdef CONFIG_PCI_IOV
>>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
>>-					 struct pnv_ioda_pe *pe)
>>+static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>>  {
>>  	struct iommu_table    *tbl;
>>+	struct device_node    *dn;
>>  	int64_t               rc;
>>
>>+	if (pe->dma32_seg == PNV_INVALID_SEGMENT)
>>+		return;
>>+
>>  	tbl = pe->table_group.tables[0];
>>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>>  	if (rc)
>>@@ -216,10 +265,91 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
>>  		iommu_group_put(pe->table_group.group);
>>  		BUG_ON(pe->table_group.group);
>>  	}
>>+
>>+	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
>>+		dn = pci_bus_to_OF_node(pe->pbus);
>>+	else if (pe->flags & PNV_IODA_PE_DEV)
>>+		dn = pci_device_to_OF_node(pe->pdev);
>>+#ifdef CONFIG_PCI_IOV
>>+	else if (pe->flags & PNV_IODA_PE_VF)
>>+		dn = pci_device_to_OF_node(pe->parent_dev);
>>+#endif
>>+	else
>>+		dn = NULL;
>>+
>>  	pnv_pci_ioda2_table_free_pages(tbl);
>>-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>+	iommu_free_table(tbl, of_node_full_name(dn));
>>+	pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+}
>
>
>
>I'd drop the chunk about calculating @dn above, nobody really cares what
>iommu_free_table() prints. If you really need to print something, print PE#.
>

It makes sense. I'll drop the chunk of garbage and replace it with the
PE number.

>>+
>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+
>>+	switch (phb->type) {
>>+	case PNV_PHB_IODA1:
>>+		pnv_pci_ioda_release_pe_dma(pe);
>>+		break;
>>+	case PNV_PHB_IODA2:
>>+		pnv_pci_ioda2_release_pe_dma(pe);
>>+		break;
>>+	default:
>>+		pr_warn("%s: Cannot release DMA for PHB type %d\n",
>>+			__func__, phb->type);
>
>This is BUG_ON() indeed because we cannot possibly get that far with
>unsupported PHB type, it would have crashed earlier.
>

Right. I'll use BUG_ON() then.

>>+	}
>>+}
>>+
>>+static void pnv_ioda_release_pe_one_seg(struct pnv_ioda_pe *pe, int win)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	unsigned long *segmap = NULL;
>>+	unsigned long *pe_segmap = NULL;
>>+	int segno, limit, mod = 0;
>>+
>>+	switch (win) {
>>+	case OPAL_IO_WINDOW_TYPE:
>>+		segmap = phb->ioda.io_segmap;
>>+		pe_segmap = pe->io_segmap;
>>+		break;
>>+	case OPAL_M32_WINDOW_TYPE:
>>+		segmap = phb->ioda.m32_segmap;
>>+		pe_segmap = pe->m32_segmap;
>>+		break;
>>+	case OPAL_M64_WINDOW_TYPE:
>>+		if (phb->type != PNV_PHB_IODA1)
>>+			return;
>>+		segmap = phb->ioda.m64_segmap;
>>+		pe_segmap = pe->m64_segmap;
>
>
>You seem to keep phb->ioda.m64_segmap update but you never actually read it,
>you only read pe->m64_segmap. Is that correct or I am missing something here?
>
>

You're correct to some extent. There are two reasons to have phb->ioda.m64_segmap,
listed below. However, you suggested having a hashtable to represent the segment
mapping, which isn't finalized yet:

- Track the used M64 segments from the PHB's domain, which makes debugging easier.
- Avoid reserving the same segment twice.

>>+		mod = 8;
>>+		break;
>>+	default:
>>+		return;
>>+	}
>>+
>>+	segno = -1;
>>+	limit = phb->ioda.total_pe_num;
>>+	while ((segno = find_next_bit(pe_segmap, limit, segno + 1)) < limit) {
>>+		if (mod > 0)
>>+			opal_pci_map_pe_mmio_window(phb->opal_id,
>>+				phb->ioda.reserved_pe_idx, win,
>>+				segno / mod, segno % mod);
>>+		else
>>+			opal_pci_map_pe_mmio_window(phb->opal_id,
>>+					phb->ioda.reserved_pe_idx, win,
>>+					0, segno);
>>+
>>+		clear_bit(segno, pe_segmap);
>>+		clear_bit(segno, segmap);
>>+	}
>>+}
>>+
>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>+{
>>+	int win;
>>+
>>+	for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++)
>>+		pnv_ioda_release_pe_one_seg(pe, win);
>>  }
>>-#endif /* CONFIG_PCI_IOV */
>>
>>  static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>  				  struct pnv_ioda_pe *parent,
>>@@ -325,7 +455,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>  	return 0;
>>  }
>>
>>-#ifdef CONFIG_PCI_IOV
>>  static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  {
>>  	struct pci_dev *parent;
>>@@ -373,9 +502,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  		}
>>  		rid_end = pe->rid + (count << 8);
>>  	} else {
>>+#ifdef CONFIG_PCI_IOV
>>  		if (pe->flags & PNV_IODA_PE_VF)
>>  			parent = pe->parent_dev;
>>  		else
>>+#endif
>>  			parent = pe->pdev->bus->self;
>>  		bcomp = OpalPciBusAll;
>>  		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>@@ -415,11 +546,72 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>
>>  	pe->pbus = NULL;
>>  	pe->pdev = NULL;
>>+#ifdef CONFIG_PCI_IOV
>>  	pe->parent_dev = NULL;
>>+#endif
>>
>>  	return 0;
>>  }
>>-#endif /* CONFIG_PCI_IOV */
>>+
>>+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct pnv_ioda_pe *tmp, *slave;
>>+
>>+	/* Release slave PEs in compound PE */
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>+			pnv_ioda_release_pe(pe);
>>+	}
>>+
>>+	/* Remove the PE from the list */
>>+	list_del(&pe->list);
>>+
>>+	/* Release resources */
>>+	pnv_ioda_release_pe_dma(pe);
>>+	pnv_ioda_release_pe_seg(pe);
>>+	pnv_ioda_deconfigure_pe(pe->phb, pe);
>>+
>>+	/* Release PE number */
>>+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>+}
>>+
>>+static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>+{
>>+	if (!pe)
>>+		return NULL;
>>+
>>+	pe->device_count++;
>>+	return pe;
>>+}
>>+
>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>+{
>>+	if (!pe)
>>+		return;
>>+
>>+	pe->device_count--;
>>+	BUG_ON(pe->device_count < 0);
>>+	if (pe->device_count == 0)
>>+		pnv_ioda_release_pe(pe);
>>+}
>
>Sure you do not want atomic_t for device_count? Races are impossibe here?
>

Yes, I don't see any possible race. Also, it's what you suggested. Here's
the comment you gave:

 | You do not need kref here. You call kref_put() in a single location and can do
 | stuff directly, without kref. Just have an "unsigned int" counter and that's
 | it (it does not even have to be atomic if you do not have races but I am not
 | sure you do not).

>>+
>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>+{
>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+	struct pnv_phb *phb = hose->private_data;
>>+	struct pci_dn *pdn = pci_get_pdn(pdev);
>>+	struct pnv_ioda_pe *pe;
>>+
>>+	if (pdev->is_virtfn)
>>+		return;
>>+
>>+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
>>+		return;
>>+
>>+	pe = &phb->ioda.pe_array[pdn->pe_number];
>>+	pnv_ioda_pe_put(pe);
>>+}
>>
>>  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>>  {
>>@@ -466,6 +658,7 @@ static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>  	return pnv_ioda_init_pe(phb, pe);
>>  }
>>
>>+#ifdef CONFIG_PCI_IOV
>>  static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>
>The name of pnv_ioda_free_pe() suggests it should work for non-SRIOV case too
>but you put it under #ifdef IOV, is that correct? Is so, rename it please.
>

It's used by SRIOV code only. I'll rename it to pnv_ioda_free_vf_pe() in
a separate patch.

>
>>  {
>>  	WARN_ON(phb->ioda.pe_array[pe].pdev);
>>@@ -473,6 +666,7 @@ static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>  	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>  	clear_bit(pe, phb->ioda.pe_alloc);
>>  }
>>+#endif
>>
>>  static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>  {
>>@@ -1177,6 +1371,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>  		if (pdn->pe_number != IODA_INVALID_PE)
>>  			continue;
>>
>>+		pnv_ioda_pe_get(pe);
>>  		pdn->pe_number = pe->pe_number;
>>  		pe->dma32_weight += pnv_ioda_dma_weight(dev);
>>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>@@ -1231,7 +1426,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>>  	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>  	pe->pbus = bus;
>>  	pe->pdev = NULL;
>>-	pe->dma32_seg = -1;
>>+	pe->dma32_seg = PNV_INVALID_SEGMENT;
>>  	pe->mve_number = -1;
>>  	pe->rid = bus->busn_res.start << 8;
>>  	pe->dma32_weight = 0;
>>@@ -1244,9 +1439,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>>  			bus->busn_res.start, pe->pe_number);
>>
>>  	if (pnv_ioda_configure_pe(phb, pe)) {
>>-		/* XXX What do we do here ? */
>>-		pnv_ioda_free_pe(phb, pe->pe_number);
>>  		pe->pbus = NULL;
>>+		pnv_ioda_release_pe(pe);
>>  		return NULL;
>>  	}
>>
>>@@ -1449,14 +1643,14 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  		if ((pe->flags & PNV_IODA_PE_MASTER) &&
>>  		    (pe->flags & PNV_IODA_PE_VF)) {
>>  			list_for_each_entry_safe(s, sn, &pe->slaves, list) {
>>-				pnv_pci_ioda2_release_dma_pe(pdev, s);
>>+				pnv_pci_ioda2_release_dma_pe(s);
>>  				list_del(&s->list);
>>  				pnv_ioda_deconfigure_pe(phb, s);
>>  				pnv_ioda_free_pe(phb, s->pe_number);
>>  			}
>>  		}
>>
>>-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
>>+		pnv_pci_ioda2_release_pe_dma(pe);
>>
>>  		/* Remove from list */
>>  		mutex_lock(&phb->ioda.pe_list_mutex);
>>@@ -1532,7 +1726,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  		pe->flags = PNV_IODA_PE_VF;
>>  		pe->pbus = NULL;
>>  		pe->parent_dev = pdev;
>>-		pe->dma32_seg = -1;
>>+		pe->dma32_seg = PNV_INVALID_SEGMENT;
>
>
>This and similar changes are not really about "Release PEs dynamically".
>

Agreed, I'll split the patch and move this and similar changes into a
separate patch.

>
>>  		pe->mve_number = -1;
>>  		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
>>  			   pci_iov_virtfn_devfn(pdev, vf_index);
>>@@ -1995,7 +2189,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>>  	/* XXX FIXME: Allocate multi-level tables on PHB3 */
>>
>>  	/* We shouldn't already have a 32-bit DMA associated */
>>-	if (WARN_ON(pe->dma32_seg >= 0))
>>+	if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
>>  		return;
>>
>>  	tbl = pnv_pci_table_alloc(phb->hose->node);
>>@@ -2066,10 +2260,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>>  	return;
>>   fail:
>>  	/* XXX Failure: Try to fallback to 64-bit only ? */
>>-	if (pe->dma32_seg >= 0) {
>>+	if (pe->dma32_seg != PNV_INVALID_SEGMENT) {
>>  		bitmap_clear(phb->ioda.dma32_segmap,
>>  			     pe->dma32_seg, pe->dma32_segcount);
>>-		pe->dma32_seg = -1;
>>+		pe->dma32_seg = PNV_INVALID_SEGMENT;
>>  		pe->dma32_segcount = 0;
>>  	}
>>
>>@@ -2416,7 +2610,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>>  	int64_t rc;
>>
>>  	/* We shouldn't already have a 32-bit DMA associated */
>>-	if (WARN_ON(pe->dma32_seg >= 0))
>>+	if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
>>  		return;
>>
>>  	/* TVE #1 is selected by PCI address bit 59 */
>>@@ -2443,8 +2637,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>>
>>  	rc = pnv_pci_ioda2_setup_default_config(pe);
>>  	if (rc) {
>>-		if (pe->dma32_seg >= 0)
>>-			pe->dma32_seg = -1;
>>+		if (pe->dma32_seg != PNV_INVALID_SEGMENT)
>>+			pe->dma32_seg = PNV_INVALID_SEGMENT;
>>  		return;
>>  	}
>>
>>@@ -3183,6 +3377,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>>         .teardown_msi_irqs = pnv_teardown_msi_irqs,
>>  #endif
>>         .enable_device_hook = pnv_pci_enable_device_hook,
>>+	.release_device = pnv_pci_release_device,
>>         .window_alignment = pnv_pci_window_alignment,
>>  	.setup_bridge = pnv_pci_setup_bridge,
>>         .reset_secondary_bus = pnv_pci_reset_secondary_bus,
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index f8e6022..2058f06 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -25,11 +25,14 @@ enum pnv_phb_model {
>>  #define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
>>  #define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
>>
>>+#define PNV_INVALID_SEGMENT	(-1)
>>+
>>  /* Data associated with a PE, including IOMMU tracking etc.. */
>>  struct pnv_phb;
>>  struct pnv_ioda_pe {
>>  	unsigned long		flags;
>>  	struct pnv_phb		*phb;
>>+	int			device_count;
>>
>>  	/* A PE can be associated with a single device or an
>>  	 * entire bus (& children). In the former case, pdev

Thanks,
Gavin



More information about the Linuxppc-dev mailing list