[PATCH v6 23/42] powerpc/powernv: Release PEs dynamically
Gavin Shan
gwshan at linux.vnet.ibm.com
Thu Aug 13 10:54:36 AEST 2015
On Tue, Aug 11, 2015 at 11:03:40PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 02:11 PM, Gavin Shan wrote:
>>This adds the refcount to PE, which represents number of PCI
>>devices contained in the PE. When last device leaves from the
>>PE, the PE together with its consumed resources (IO, DMA, PELTM,
>>PELTV) are released, to support PCI hotplug.
>>
>>Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++++++++++++++++++++++++++---
>> arch/powerpc/platforms/powernv/pci.h | 3 +
>> 2 files changed, 217 insertions(+), 19 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index d2697a3..13d8a5b 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -132,6 +132,53 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>> }
>>
>>+static void pnv_pci_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>
>Is this ioda1 helper or common helper for both ioda1 and ioda2?
>
It's for IODA1 only.
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ struct iommu_table *tbl;
>>+ int seg;
>>+ int64_t rc;
>>+
>>+ /* No DMA32 segments allocated */
>>+ if (pe->dma32_seg == PNV_INVALID_SEGMENT ||
>>+ pe->dma32_segcount <= 0) {
>
>
>dma32_segcount is unsigned long, cannot be less than 0.
>
It's "int dma32_segcount" in pci.h:
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+ pe->dma32_segcount = 0;
>>+ return;
>>+ }
>>+
>>+ /* Unlink IOMMU table from group */
>>+ tbl = pe->table_group.tables[0];
>>+ pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>>+ if (pe->table_group.group) {
>>+ iommu_group_put(pe->table_group.group);
>>+ BUG_ON(pe->table_group.group);
>>+ }
>>+
>>+ /* Release IOMMU table */
>>+ free_pages(tbl->it_base,
>>+ get_order(TCE32_TABLE_SIZE * pe->dma32_segcount));
>>+ iommu_free_table(tbl,
>>+ of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
>
>There is pnv_pci_ioda2_table_free_pages(), use it.
>
The function (pnv_pci_ioda_release_pe_dma()) is for IODA1 only.
>>+
>>+ /* Disable TVE */
>>+ for (seg = pe->dma32_seg;
>>+ seg < pe->dma32_seg + pe->dma32_segcount;
>>+ seg++) {
>>+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
>>+ pe->pe_number, seg, 0, 0ul, 0ul, 0ul);
>>+ if (rc)
>>+ pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
>>+ rc, seg);
>>+ }
>
>May be implement iommu_table_group_ops::unset_window for IODA1 too?
>
Good point, but it's out of scope for this patch. I'm putting it on my TODO
list and will cook up a patch when I get the chance.
>>+
>>+ /* Free the DMA32 segments */
>>+ bitmap_clear(phb->ioda.dma32_segmap,
>>+ pe->dma32_seg, pe->dma32_segcount);
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+ pe->dma32_segcount = 0;
>>+}
>>+
>> static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
>> {
>> /* 01xb - invalidate TCEs that match the specified PE# */
>>@@ -199,13 +246,15 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>> pe->tce_bypass_enabled = enable;
>> }
>>
>>-#ifdef CONFIG_PCI_IOV
>>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
>>- struct pnv_ioda_pe *pe)
>>+static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>> {
>> struct iommu_table *tbl;
>>+ struct device_node *dn;
>> int64_t rc;
>>
>>+ if (pe->dma32_seg == PNV_INVALID_SEGMENT)
>>+ return;
>>+
>> tbl = pe->table_group.tables[0];
>> rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>> if (rc)
>>@@ -216,10 +265,91 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev,
>> iommu_group_put(pe->table_group.group);
>> BUG_ON(pe->table_group.group);
>> }
>>+
>>+ if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
>>+ dn = pci_bus_to_OF_node(pe->pbus);
>>+ else if (pe->flags & PNV_IODA_PE_DEV)
>>+ dn = pci_device_to_OF_node(pe->pdev);
>>+#ifdef CONFIG_PCI_IOV
>>+ else if (pe->flags & PNV_IODA_PE_VF)
>>+ dn = pci_device_to_OF_node(pe->parent_dev);
>>+#endif
>>+ else
>>+ dn = NULL;
>>+
>> pnv_pci_ioda2_table_free_pages(tbl);
>>- iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>+ iommu_free_table(tbl, of_node_full_name(dn));
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>>+}
>
>
>
>I'd drop the chunk about calculating @dn above, nobody really cares what
>iommu_free_table() prints. If you really need to print something, print PE#.
>
It makes sense. I'll drop the chunk of garbage and replace it with the
PE number.
>>+
>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+
>>+ switch (phb->type) {
>>+ case PNV_PHB_IODA1:
>>+ pnv_pci_ioda_release_pe_dma(pe);
>>+ break;
>>+ case PNV_PHB_IODA2:
>>+ pnv_pci_ioda2_release_pe_dma(pe);
>>+ break;
>>+ default:
>>+ pr_warn("%s: Cannot release DMA for PHB type %d\n",
>>+ __func__, phb->type);
>
>This is BUG_ON() indeed because we cannot possibly get that far with
>unsupported PHB type, it would have crashed earlier.
>
Right. I'll use BUG_ON() then.
>>+ }
>>+}
>>+
>>+static void pnv_ioda_release_pe_one_seg(struct pnv_ioda_pe *pe, int win)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ unsigned long *segmap = NULL;
>>+ unsigned long *pe_segmap = NULL;
>>+ int segno, limit, mod = 0;
>>+
>>+ switch (win) {
>>+ case OPAL_IO_WINDOW_TYPE:
>>+ segmap = phb->ioda.io_segmap;
>>+ pe_segmap = pe->io_segmap;
>>+ break;
>>+ case OPAL_M32_WINDOW_TYPE:
>>+ segmap = phb->ioda.m32_segmap;
>>+ pe_segmap = pe->m32_segmap;
>>+ break;
>>+ case OPAL_M64_WINDOW_TYPE:
>>+ if (phb->type != PNV_PHB_IODA1)
>>+ return;
>>+ segmap = phb->ioda.m64_segmap;
>>+ pe_segmap = pe->m64_segmap;
>
>
>You seem to keep phb->ioda.m64_segmap update but you never actually read it,
>you only read pe->m64_segmap. Is that correct or I am missing something here?
>
>
You're correct to some extent. There are two reasons to have phb->ioda.m64_segmap,
listed below. However, you suggested using a hashtable to represent the segment
mapping, which isn't finalized yet:
- Track the used M64 segments from the PHB's domain. Easy for debugging.
- Used to avoid reserving the same segment twice.
>>+ mod = 8;
>>+ break;
>>+ default:
>>+ return;
>>+ }
>>+
>>+ segno = -1;
>>+ limit = phb->ioda.total_pe_num;
>>+ while ((segno = find_next_bit(pe_segmap, limit, segno + 1)) < limit) {
>>+ if (mod > 0)
>>+ opal_pci_map_pe_mmio_window(phb->opal_id,
>>+ phb->ioda.reserved_pe_idx, win,
>>+ segno / mod, segno % mod);
>>+ else
>>+ opal_pci_map_pe_mmio_window(phb->opal_id,
>>+ phb->ioda.reserved_pe_idx, win,
>>+ 0, segno);
>>+
>>+ clear_bit(segno, pe_segmap);
>>+ clear_bit(segno, segmap);
>>+ }
>>+}
>>+
>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>+{
>>+ int win;
>>+
>>+ for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++)
>>+ pnv_ioda_release_pe_one_seg(pe, win);
>> }
>>-#endif /* CONFIG_PCI_IOV */
>>
>> static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>> struct pnv_ioda_pe *parent,
>>@@ -325,7 +455,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>> return 0;
>> }
>>
>>-#ifdef CONFIG_PCI_IOV
>> static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>> {
>> struct pci_dev *parent;
>>@@ -373,9 +502,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>> }
>> rid_end = pe->rid + (count << 8);
>> } else {
>>+#ifdef CONFIG_PCI_IOV
>> if (pe->flags & PNV_IODA_PE_VF)
>> parent = pe->parent_dev;
>> else
>>+#endif
>> parent = pe->pdev->bus->self;
>> bcomp = OpalPciBusAll;
>> dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>@@ -415,11 +546,72 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>
>> pe->pbus = NULL;
>> pe->pdev = NULL;
>>+#ifdef CONFIG_PCI_IOV
>> pe->parent_dev = NULL;
>>+#endif
>>
>> return 0;
>> }
>>-#endif /* CONFIG_PCI_IOV */
>>+
>>+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ struct pnv_ioda_pe *tmp, *slave;
>>+
>>+ /* Release slave PEs in compound PE */
>>+ if (pe->flags & PNV_IODA_PE_MASTER) {
>>+ list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>+ pnv_ioda_release_pe(pe);
>>+ }
>>+
>>+ /* Remove the PE from the list */
>>+ list_del(&pe->list);
>>+
>>+ /* Release resources */
>>+ pnv_ioda_release_pe_dma(pe);
>>+ pnv_ioda_release_pe_seg(pe);
>>+ pnv_ioda_deconfigure_pe(pe->phb, pe);
>>+
>>+ /* Release PE number */
>>+ clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>+}
>>+
>>+static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>+{
>>+ if (!pe)
>>+ return NULL;
>>+
>>+ pe->device_count++;
>>+ return pe;
>>+}
>>+
>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>+{
>>+ if (!pe)
>>+ return;
>>+
>>+ pe->device_count--;
>>+ BUG_ON(pe->device_count < 0);
>>+ if (pe->device_count == 0)
>>+ pnv_ioda_release_pe(pe);
>>+}
>
>Sure you do not want atomic_t for device_count? Races are impossible here?
>
Yes, I don't see any possible race. Also, it's what you suggested. Here's
the comment you gave:
| You do not need kref here. You call kref_put() in a single location and can do
| stuff directly, without kref. Just have an "unsigned int" counter and that's
| it (it does not even have to be atomic if you do not have races but I am not
| sure you do not).
>>+
>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>+{
>>+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+ struct pnv_phb *phb = hose->private_data;
>>+ struct pci_dn *pdn = pci_get_pdn(pdev);
>>+ struct pnv_ioda_pe *pe;
>>+
>>+ if (pdev->is_virtfn)
>>+ return;
>>+
>>+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
>>+ return;
>>+
>>+ pe = &phb->ioda.pe_array[pdn->pe_number];
>>+ pnv_ioda_pe_put(pe);
>>+}
>>
>> static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>> {
>>@@ -466,6 +658,7 @@ static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>> return pnv_ioda_init_pe(phb, pe);
>> }
>>
>>+#ifdef CONFIG_PCI_IOV
>> static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>
>The name of pnv_ioda_free_pe() suggests it should work for non-SRIOV case too
>but you put it under #ifdef IOV, is that correct? Is so, rename it please.
>
It's used by SRIOV code only. I'll rename it to pnv_ioda_free_vf_pe() in a
separate patch.
>
>> {
>> WARN_ON(phb->ioda.pe_array[pe].pdev);
>>@@ -473,6 +666,7 @@ static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>> memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>> clear_bit(pe, phb->ioda.pe_alloc);
>> }
>>+#endif
>>
>> static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>> {
>>@@ -1177,6 +1371,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>> if (pdn->pe_number != IODA_INVALID_PE)
>> continue;
>>
>>+ pnv_ioda_pe_get(pe);
>> pdn->pe_number = pe->pe_number;
>> pe->dma32_weight += pnv_ioda_dma_weight(dev);
>> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>@@ -1231,7 +1426,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>> pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>> pe->pbus = bus;
>> pe->pdev = NULL;
>>- pe->dma32_seg = -1;
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>> pe->mve_number = -1;
>> pe->rid = bus->busn_res.start << 8;
>> pe->dma32_weight = 0;
>>@@ -1244,9 +1439,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>> bus->busn_res.start, pe->pe_number);
>>
>> if (pnv_ioda_configure_pe(phb, pe)) {
>>- /* XXX What do we do here ? */
>>- pnv_ioda_free_pe(phb, pe->pe_number);
>> pe->pbus = NULL;
>>+ pnv_ioda_release_pe(pe);
>> return NULL;
>> }
>>
>>@@ -1449,14 +1643,14 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> if ((pe->flags & PNV_IODA_PE_MASTER) &&
>> (pe->flags & PNV_IODA_PE_VF)) {
>> list_for_each_entry_safe(s, sn, &pe->slaves, list) {
>>- pnv_pci_ioda2_release_dma_pe(pdev, s);
>>+ pnv_pci_ioda2_release_dma_pe(s);
>> list_del(&s->list);
>> pnv_ioda_deconfigure_pe(phb, s);
>> pnv_ioda_free_pe(phb, s->pe_number);
>> }
>> }
>>
>>- pnv_pci_ioda2_release_dma_pe(pdev, pe);
>>+ pnv_pci_ioda2_release_pe_dma(pe);
>>
>> /* Remove from list */
>> mutex_lock(&phb->ioda.pe_list_mutex);
>>@@ -1532,7 +1726,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> pe->flags = PNV_IODA_PE_VF;
>> pe->pbus = NULL;
>> pe->parent_dev = pdev;
>>- pe->dma32_seg = -1;
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>
>
>This and similar changes are not really about "Release PEs dynamically".
>
Agreed. I'll split the patch and move this and similar changes into a
separate patch.
>
>> pe->mve_number = -1;
>> pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
>> pci_iov_virtfn_devfn(pdev, vf_index);
>>@@ -1995,7 +2189,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>> /* XXX FIXME: Allocate multi-level tables on PHB3 */
>>
>> /* We shouldn't already have a 32-bit DMA associated */
>>- if (WARN_ON(pe->dma32_seg >= 0))
>>+ if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
>> return;
>>
>> tbl = pnv_pci_table_alloc(phb->hose->node);
>>@@ -2066,10 +2260,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>> return;
>> fail:
>> /* XXX Failure: Try to fallback to 64-bit only ? */
>>- if (pe->dma32_seg >= 0) {
>>+ if (pe->dma32_seg != PNV_INVALID_SEGMENT) {
>> bitmap_clear(phb->ioda.dma32_segmap,
>> pe->dma32_seg, pe->dma32_segcount);
>>- pe->dma32_seg = -1;
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>> pe->dma32_segcount = 0;
>> }
>>
>>@@ -2416,7 +2610,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>> int64_t rc;
>>
>> /* We shouldn't already have a 32-bit DMA associated */
>>- if (WARN_ON(pe->dma32_seg >= 0))
>>+ if (WARN_ON(pe->dma32_seg != PNV_INVALID_SEGMENT))
>> return;
>>
>> /* TVE #1 is selected by PCI address bit 59 */
>>@@ -2443,8 +2637,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>>
>> rc = pnv_pci_ioda2_setup_default_config(pe);
>> if (rc) {
>>- if (pe->dma32_seg >= 0)
>>- pe->dma32_seg = -1;
>>+ if (pe->dma32_seg != PNV_INVALID_SEGMENT)
>>+ pe->dma32_seg = PNV_INVALID_SEGMENT;
>> return;
>> }
>>
>>@@ -3183,6 +3377,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>> .teardown_msi_irqs = pnv_teardown_msi_irqs,
>> #endif
>> .enable_device_hook = pnv_pci_enable_device_hook,
>>+ .release_device = pnv_pci_release_device,
>> .window_alignment = pnv_pci_window_alignment,
>> .setup_bridge = pnv_pci_setup_bridge,
>> .reset_secondary_bus = pnv_pci_reset_secondary_bus,
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index f8e6022..2058f06 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -25,11 +25,14 @@ enum pnv_phb_model {
>> #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
>> #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
>>
>>+#define PNV_INVALID_SEGMENT (-1)
>>+
>> /* Data associated with a PE, including IOMMU tracking etc.. */
>> struct pnv_phb;
>> struct pnv_ioda_pe {
>> unsigned long flags;
>> struct pnv_phb *phb;
>>+ int device_count;
>>
>> /* A PE can be associated with a single device or an
>> * entire bus (& children). In the former case, pdev
Thanks,
Gavin
More information about the Linuxppc-dev
mailing list