[PATCH v7 27/50] powerpc/powernv: Dynamically release PEs
Gavin Shan
gwshan at linux.vnet.ibm.com
Tue Nov 24 10:06:01 AEDT 2015
On Wed, Nov 18, 2015 at 01:23:05PM +1100, Alexey Kardashevskiy wrote:
>On 11/05/2015 12:12 AM, Gavin Shan wrote:
>>This adds a reference count of PE, representing the number of PCI
>>devices associated with the PE. The reference count is increased
>>or decreased when PCI devices join or leave the PE. Once it becomes
>>zero, the PE together with its used resources (IO, MMIO, DMA, PELTM,
>>PELTV) are released to support PCI hot unplug.
>
>
>The commit log suggests the patch only adds a counter, initializes it, and
>replaces unconditional release of an object (in this case - PE) with the
>conditional one. But it is more than that...
>
Yes, it's more than that as stated in the commit log.
>>Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 245 ++++++++++++++++++++++++++----
>> arch/powerpc/platforms/powernv/pci.h | 1 +
>> 2 files changed, 218 insertions(+), 28 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 0bb0056..dcffce5 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -129,6 +129,215 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>> }
>>
>>+static void pnv_pci_ioda1_release_dma_pe(struct pnv_ioda_pe *pe)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ struct iommu_table *tbl;
>>+ int start, count, i;
>>+ int64_t rc;
>>+
>>+ /* Search for the used DMA32 segments */
>>+ start = -1;
>>+ count = 0;
>>+ for (i = 0; i < phb->ioda.dma32_count; i++) {
>>+ if (phb->ioda.dma32_segmap[i] != pe->pe_number)
>>+ continue;
>>+
>>+ count++;
>>+ if (start < 0)
>>+ start = i;
>>+ }
>>+
>>+ if (!count)
>>+ return;
>
>
>imho checking pe->table_group.tables[0] != NULL is shorter than the loop above.
>
Will use it in next revision.
>>+
>>+ /* Unlink IOMMU table from group */
>>+ tbl = pe->table_group.tables[0];
>>+ pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>>+ if (pe->table_group.group) {
>>+ iommu_group_put(pe->table_group.group);
>>+ WARN_ON(pe->table_group.group);
>>+ }
>>+
>>+ /* Release IOMMU table */
>>+ pnv_pci_ioda2_table_free_pages(tbl);
>
>
>This is an IODA2 helper with multilevel support. Does IODA1 support multilevel
>TCE tables? If not, it should WARN_ON on levels!=1.
>
>Another thing is you should first unprogram TVEs (via
>opal_pci_map_pe_dma_window), then invalidate the cache (if required, not sure
>if this is needed on IODA1), only then free the actual table.
>
>
>>+ iommu_free_table(tbl, of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
>>+
>>+ /* Disable TVE */
>>+ for (i = start; i < start + count; i++) {
>>+ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
>>+ i, 0, 0ul, 0ul, 0ul);
>>+ if (rc)
>>+ pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
>>+ rc, i);
>>+
>>+ phb->ioda.dma32_segmap[i] = IODA_INVALID_PE;
>>+ }
>
>
>You could implement pnv_pci_ioda1_unset_window/pnv_ioda1_table_free as
>callbacks, change pnv_pci_ioda2_release_dma_pe() to use them (and rename it
>to reflect that it supports IODA1 and IODA2).
>
>
>>+}
>>+
>>+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe);
>>+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>>+ int num);
>>+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>>+
>>+static void pnv_pci_ioda2_release_dma_pe(struct pnv_ioda_pe *pe)
>
>
>You moved this function and changed it, please do one thing at a time (which is
>"change", not "move").
>
>>+{
>>+ struct iommu_table *tbl;
>>+ unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
>>+ int64_t rc;
>>+
>>+ if (!weight)
>>+ return;
>
>
>Checking for pe->table_group.group is better because if we ever change the
>logic of what gets included to an IOMMU group, we will have to do the change
>where we add devices to a group but we won't have to touch releasing code.
>
>
>>+
>>+ tbl = pe->table_group.tables[0];
>>+ rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>>+ if (rc)
>>+ pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>+
>>+ pnv_pci_ioda2_set_bypass(pe, false);
>>+ if (pe->table_group.group) {
>>+ iommu_group_put(pe->table_group.group);
>>+ WARN_ON(pe->table_group.group);
>>+ }
>>+
>>+ pnv_pci_ioda2_table_free_pages(tbl);
>>+ iommu_free_table(tbl, "pnv");
>>+}
>>+
>>+static void pnv_ioda_release_dma_pe(struct pnv_ioda_pe *pe)
>
>Merge this into pnv_ioda_release_pe() - it is small and called just once.
>
>
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+
>>+ switch (phb->type) {
>>+ case PNV_PHB_IODA1:
>>+ pnv_pci_ioda1_release_dma_pe(pe);
>>+ break;
>>+ case PNV_PHB_IODA2:
>>+ pnv_pci_ioda2_release_dma_pe(pe);
>>+ break;
>>+ default:
>>+ WARN_ON(1);
>>+ }
>>+}
>>+
>>+static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ int index, *segmap = NULL;
>>+ int64_t rc;
>>+
>>+ switch (win) {
>>+ case OPAL_IO_WINDOW_TYPE:
>>+ segmap = phb->ioda.io_segmap;
>>+ break;
>>+ case OPAL_M32_WINDOW_TYPE:
>>+ segmap = phb->ioda.m32_segmap;
>>+ break;
>>+ case OPAL_M64_WINDOW_TYPE:
>>+ if (phb->type != PNV_PHB_IODA1)
>>+ return;
>>+ segmap = phb->ioda.m64_segmap;
>>+ break;
>>+ default:
>>+ return;
>
>Unnecessary return.
>
>
>>+ }
>>+
>>+ for (index = 0; index < phb->ioda.total_pe_num; index++) {
>>+ if (segmap[index] != pe->pe_number)
>>+ continue;
>>+
>>+ if (win == OPAL_M64_WINDOW_TYPE)
>>+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>+ phb->ioda.reserved_pe_idx, win,
>>+ index / PNV_IODA1_M64_SEGS,
>>+ index % PNV_IODA1_M64_SEGS);
>>+ else
>>+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>+ phb->ioda.reserved_pe_idx, win,
>>+ 0, index);
>>+
>>+ if (rc != OPAL_SUCCESS)
>>+ pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
>>+ rc, win, index);
>>+
>>+ segmap[index] = IODA_INVALID_PE;
>>+ }
>>+}
>>+
>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>+{
>>+ struct pnv_phb *phb = pe->phb;
>>+ int win;
>>+
>>+ for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++) {
>>+ if (phb->type == PNV_PHB_IODA2 && win == OPAL_IO_WINDOW_TYPE)
>>+ continue;
>
>Move this check to pnv_ioda_release_window() or move case(win ==
>OPAL_M64_WINDOW_TYPE):if(phb->type != PNV_PHB_IODA1) from that function here.
>
>
>>+
>>+ pnv_ioda_release_window(pe, win);
>>+ }
>>+}
>
>This is shorter and cleaner:
>
>
>static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win, int
>*segmap)
>{
> struct pnv_phb *phb = pe->phb;
> int index;
> int64_t rc;
>
> for (index = 0; index < phb->ioda.total_pe_num; index++) {
> if (segmap[index] != pe->pe_number)
> continue;
>
> if (win == OPAL_M64_WINDOW_TYPE)
> rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> phb->ioda.reserved_pe_idx, win,
> index / PNV_IODA1_M64_SEGS,
> index % PNV_IODA1_M64_SEGS);
> else
> rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> phb->ioda.reserved_pe_idx, win,
> 0, index);
>
> if (rc != OPAL_SUCCESS)
> pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
> rc, win, index);
>
> segmap[index] = IODA_INVALID_PE;
> }
>}
>
>static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>{
> pnv_ioda_release_window(pe, OPAL_M32_WINDOW_TYPE,
>phb->ioda.m32_segmap);
> if (phb->type != PNV_PHB_IODA2)
> pnv_ioda_release_window(pe, OPAL_IO_WINDOW_TYPE,
> phb->ioda.io_segmap);
> else
> pnv_ioda_release_window(pe, OPAL_M64_WINDOW_TYPE,
> phb->ioda.m64_segmap);
>}
>
>
>I'd actually merge pnv_ioda_release_pe_seg() into pnv_ioda_release_pe() as
>well as it is also small and called once.
>
>
>>+
>>+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb,
>>+ struct pnv_ioda_pe *pe);
>>+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
>>+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
>>+{
>>+ struct pnv_ioda_pe *tmp, *slave;
>>+
>>+ /* Release slave PEs in compound PE */
>>+ if (pe->flags & PNV_IODA_PE_MASTER) {
>>+ list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>+ pnv_ioda_release_pe(slave);
>>+ }
>>+
>>+ /* Remove the PE from the list */
>>+ list_del(&pe->list);
>>+
>>+ /* Release resources */
>>+ pnv_ioda_release_dma_pe(pe);
>>+ pnv_ioda_release_pe_seg(pe);
>>+ pnv_ioda_deconfigure_pe(pe->phb, pe);
>>+
>>+ pnv_ioda_free_pe(pe);
>>+}
>>+
>>+static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>+{
>>+ if (!pe)
>>+ return NULL;
>>+
>>+ pe->device_count++;
>>+ return pe;
>>+}
>>+
>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>
>
>Merge this into pnv_pci_release_device() as it is small and called only once.
>
I don't think so. The functions pnv_ioda_pe_{get,put}() are paired. I think it's
good enough to have a separate function for the logic included in pnv_ioda_pe_put().
>>+{
>>+ if (!pe)
>>+ return;
>>+
>>+ pe->device_count--;
>>+ WARN_ON(pe->device_count < 0);
>>+ if (pe->device_count == 0)
>>+ pnv_ioda_release_pe(pe);
>>+}
>>+
>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>+{
>>+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+ struct pnv_phb *phb = hose->private_data;
>>+ struct pci_dn *pdn = pci_get_pdn(pdev);
>>+ struct pnv_ioda_pe *pe;
>>+
>>+ if (pdev->is_virtfn)
>>+ return;
>>+
>>+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
>>+ return;
>>+
>>+ pe = &phb->ioda.pe_array[pdn->pe_number];
>>+ pnv_ioda_pe_put(pe);
>>+}
>>+
>> static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>> {
>> phb->ioda.pe_array[pe_no].phb = phb;
>>@@ -724,7 +933,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>> return 0;
>> }
>>
>>-#ifdef CONFIG_PCI_IOV
>> static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>> {
>> struct pci_dev *parent;
>>@@ -759,9 +967,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>> }
>> rid_end = pe->rid + (count << 8);
>> } else {
>>+#ifdef CONFIG_PCI_IOV
>> if (pe->flags & PNV_IODA_PE_VF)
>> parent = pe->parent_dev;
>> else
>>+#endif
>> parent = pe->pdev->bus->self;
>> bcomp = OpalPciBusAll;
>> dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>@@ -799,11 +1009,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>
>> pe->pbus = NULL;
>> pe->pdev = NULL;
>>+#ifdef CONFIG_PCI_IOV
>> pe->parent_dev = NULL;
>>+#endif
>
>
>These #ifdef movements seem very much unrelated.
>
It's related: pnv_ioda_deconfigure_pe() was used for VF PE only. Now it's used by all
types of PEs. pe->parent_dev is declared as below:
#ifdef CONFIG_PCI_IOV
struct pci_dev *parent_dev;
#endif
>
>>
>> return 0;
>> }
>>-#endif /* CONFIG_PCI_IOV */
>>
>> static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>> {
>>@@ -985,6 +1196,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>> continue;
>>
>> pdn->pe_number = pe->pe_number;
>>+ pnv_ioda_pe_get(pe);
>> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>> pnv_ioda_setup_same_PE(dev->subordinate, pe);
>> }
>>@@ -1047,9 +1259,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>> bus->busn_res.start, pe->pe_number);
>>
>> if (pnv_ioda_configure_pe(phb, pe)) {
>>- /* XXX What do we do here ? */
>>- pnv_ioda_free_pe(pe);
>> pe->pbus = NULL;
>>+ pnv_ioda_release_pe(pe);
>
>
>This is an unrelated, unexplained change.
>
Will drop it in next revision.
>> return NULL;
>> }
>>
>>@@ -1199,29 +1410,6 @@ m64_failed:
>> return -EBUSY;
>> }
>>
>>-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>>- int num);
>>-static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
>>-
>>-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
>>-{
>>- struct iommu_table *tbl;
>>- int64_t rc;
>>-
>>- tbl = pe->table_group.tables[0];
>>- rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>>- if (rc)
>>- pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>-
>>- pnv_pci_ioda2_set_bypass(pe, false);
>>- if (pe->table_group.group) {
>>- iommu_group_put(pe->table_group.group);
>>- BUG_ON(pe->table_group.group);
>>- }
>>- pnv_pci_ioda2_table_free_pages(tbl);
>>- iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>-}
>>-
>> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> {
>> struct pci_bus *bus;
>>@@ -1242,7 +1430,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> if (pe->parent_dev != pdev)
>> continue;
>>
>>- pnv_pci_ioda2_release_dma_pe(pdev, pe);
>>+ pnv_pci_ioda2_release_dma_pe(pe);
>
>
>This is an unrelated change.
>
>>
>> /* Remove from list */
>> mutex_lock(&phb->ioda.pe_list_mutex);
>>@@ -3124,6 +3312,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>> .teardown_msi_irqs = pnv_teardown_msi_irqs,
>> #endif
>> .enable_device_hook = pnv_pci_enable_device_hook,
>>+ .release_device = pnv_pci_release_device,
>> .window_alignment = pnv_pci_window_alignment,
>> .setup_bridge = pnv_pci_setup_bridge,
>> .reset_secondary_bus = pnv_pci_reset_secondary_bus,
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index ef5271a..3bb10de 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -30,6 +30,7 @@ struct pnv_phb;
>> struct pnv_ioda_pe {
>> unsigned long flags;
>> struct pnv_phb *phb;
>>+ int device_count;
>
>Not atomic_t, no kref, no additional mutex, just "int"? Sure about it? If so,
>put a note to the commit log about what provides a guarantee that there is no
>race.
>
>
It was a kref; it was changed following your suggestion on v5, quoted below:
| You do not need kref here. You call kref_put() in a single location and can do
| stuff directly, without kref. Just have an "unsigned int" counter and that's
| it (it does not even have to be atomic if you do not have races but I am not
| sure you do not).
|
>>
>> /* A PE can be associated with a single device or an
>> * entire bus (& children). In the former case, pdev
>>
Thanks,
Gavin
More information about the Linuxppc-dev
mailing list