[PATCH v7 27/50] powerpc/powernv: Dynamically release PEs
Alexey Kardashevskiy
aik at au1.ibm.com
Wed Nov 18 13:23:05 AEDT 2015
On 11/05/2015 12:12 AM, Gavin Shan wrote:
> This adds a reference count of PE, representing the number of PCI
> devices associated with the PE. The reference count is increased
> or decreased when PCI devices join or leave the PE. Once it becomes
> zero, the PE together with its used resources (IO, MMIO, DMA, PELTM,
> PELTV) are released to support PCI hot unplug.
The commit log suggests the patch only adds a counter, initializes it, and
replaces the unconditional release of an object (in this case - a PE) with a
conditional one. But it is more than that...
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---
> arch/powerpc/platforms/powernv/pci-ioda.c | 245 ++++++++++++++++++++++++++----
> arch/powerpc/platforms/powernv/pci.h | 1 +
> 2 files changed, 218 insertions(+), 28 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 0bb0056..dcffce5 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -129,6 +129,215 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
> }
>
> +static void pnv_pci_ioda1_release_dma_pe(struct pnv_ioda_pe *pe)
> +{
> + struct pnv_phb *phb = pe->phb;
> + struct iommu_table *tbl;
> + int start, count, i;
> + int64_t rc;
> +
> + /* Search for the used DMA32 segments */
> + start = -1;
> + count = 0;
> + for (i = 0; i < phb->ioda.dma32_count; i++) {
> + if (phb->ioda.dma32_segmap[i] != pe->pe_number)
> + continue;
> +
> + count++;
> + if (start < 0)
> + start = i;
> + }
> +
> + if (!count)
> + return;
imho checking pe->table_group.tables[0] != NULL is shorter than the loop above.
> +
> + /* Unlink IOMMU table from group */
> + tbl = pe->table_group.tables[0];
> + pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
> + if (pe->table_group.group) {
> + iommu_group_put(pe->table_group.group);
> + WARN_ON(pe->table_group.group);
> + }
> +
> + /* Release IOMMU table */
> + pnv_pci_ioda2_table_free_pages(tbl);
This is IODA2 helper with multilevel support, does IODA1 support multilevel
TCE tables? If not, it should WARN_ON on levels!=1.
Another thing is you should first unprogram TVEs (via
opal_pci_map_pe_dma_window), then invalidate the cache (if required, not
sure if this is needed on IODA1), only then free the actual table.
> + iommu_free_table(tbl, of_node_full_name(pci_bus_to_OF_node(pe->pbus)));
> +
> + /* Disable TVE */
> + for (i = start; i < start + count; i++) {
> + rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> + i, 0, 0ul, 0ul, 0ul);
> + if (rc)
> + pe_warn(pe, "Error %ld unmapping DMA32 seg#%d\n",
> + rc, i);
> +
> + phb->ioda.dma32_segmap[i] = IODA_INVALID_PE;
> + }
You could implement pnv_pci_ioda1_unset_window/pnv_ioda1_table_free as
callbacks, change pnv_pci_ioda2_release_dma_pe() to use them (and rename it
to reflect that it supports IODA1 and IODA2).
> +}
> +
> +static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe);
> +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> + int num);
> +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +
> +static void pnv_pci_ioda2_release_dma_pe(struct pnv_ioda_pe *pe)
You moved this function and changed it in the same patch; please do one
thing at a time (in this case the "change", not the "move").
> +{
> + struct iommu_table *tbl;
> + unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
> + int64_t rc;
> +
> + if (!weight)
> + return;
Checking for pe->table_group.group is better because if we ever change the
logic of what gets included to an IOMMU group, we will have to do the
change where we add devices to a group but we won't have to touch releasing
code.
> +
> + tbl = pe->table_group.tables[0];
> + rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> + if (rc)
> + pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> +
> + pnv_pci_ioda2_set_bypass(pe, false);
> + if (pe->table_group.group) {
> + iommu_group_put(pe->table_group.group);
> + WARN_ON(pe->table_group.group);
> + }
> +
> + pnv_pci_ioda2_table_free_pages(tbl);
> + iommu_free_table(tbl, "pnv");
> +}
> +
> +static void pnv_ioda_release_dma_pe(struct pnv_ioda_pe *pe)
Merge this into pnv_ioda_release_pe() - it is small and called just once.
> +{
> + struct pnv_phb *phb = pe->phb;
> +
> + switch (phb->type) {
> + case PNV_PHB_IODA1:
> + pnv_pci_ioda1_release_dma_pe(pe);
> + break;
> + case PNV_PHB_IODA2:
> + pnv_pci_ioda2_release_dma_pe(pe);
> + break;
> + default:
> + WARN_ON(1);
> + }
> +}
> +
> +static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win)
> +{
> + struct pnv_phb *phb = pe->phb;
> + int index, *segmap = NULL;
> + int64_t rc;
> +
> + switch (win) {
> + case OPAL_IO_WINDOW_TYPE:
> + segmap = phb->ioda.io_segmap;
> + break;
> + case OPAL_M32_WINDOW_TYPE:
> + segmap = phb->ioda.m32_segmap;
> + break;
> + case OPAL_M64_WINDOW_TYPE:
> + if (phb->type != PNV_PHB_IODA1)
> + return;
> + segmap = phb->ioda.m64_segmap;
> + break;
> + default:
> + return;
Unnecessary return.
> + }
> +
> + for (index = 0; index < phb->ioda.total_pe_num; index++) {
> + if (segmap[index] != pe->pe_number)
> + continue;
> +
> + if (win == OPAL_M64_WINDOW_TYPE)
> + rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> + phb->ioda.reserved_pe_idx, win,
> + index / PNV_IODA1_M64_SEGS,
> + index % PNV_IODA1_M64_SEGS);
> + else
> + rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> + phb->ioda.reserved_pe_idx, win,
> + 0, index);
> +
> + if (rc != OPAL_SUCCESS)
> + pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
> + rc, win, index);
> +
> + segmap[index] = IODA_INVALID_PE;
> + }
> +}
> +
> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
> +{
> + struct pnv_phb *phb = pe->phb;
> + int win;
> +
> + for (win = OPAL_M32_WINDOW_TYPE; win <= OPAL_IO_WINDOW_TYPE; win++) {
> + if (phb->type == PNV_PHB_IODA2 && win == OPAL_IO_WINDOW_TYPE)
> + continue;
Either move this check into pnv_ioda_release_window(), or move the
"if (phb->type != PNV_PHB_IODA1) return;" check for OPAL_M64_WINDOW_TYPE
from that function into this loop.
> +
> + pnv_ioda_release_window(pe, win);
> + }
> +}
This is shorter and cleaner:
static void pnv_ioda_release_window(struct pnv_ioda_pe *pe, int win,
				    int *segmap)
{
struct pnv_phb *phb = pe->phb;
int index;
int64_t rc;
for (index = 0; index < phb->ioda.total_pe_num; index++) {
if (segmap[index] != pe->pe_number)
continue;
if (win == OPAL_M64_WINDOW_TYPE)
rc = opal_pci_map_pe_mmio_window(phb->opal_id,
phb->ioda.reserved_pe_idx, win,
index / PNV_IODA1_M64_SEGS,
index % PNV_IODA1_M64_SEGS);
else
rc = opal_pci_map_pe_mmio_window(phb->opal_id,
phb->ioda.reserved_pe_idx, win,
0, index);
if (rc != OPAL_SUCCESS)
pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
rc, win, index);
segmap[index] = IODA_INVALID_PE;
}
}
static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
{
	struct pnv_phb *phb = pe->phb;

	pnv_ioda_release_window(pe, OPAL_M32_WINDOW_TYPE,
			phb->ioda.m32_segmap);
if (phb->type != PNV_PHB_IODA2)
pnv_ioda_release_window(pe, OPAL_IO_WINDOW_TYPE,
phb->ioda.io_segmap);
else
pnv_ioda_release_window(pe, OPAL_M64_WINDOW_TYPE,
phb->ioda.m64_segmap);
}
I'd actually merge pnv_ioda_release_pe_seg() into pnv_ioda_release_pe() as
well as it is also small and called once.
> +
> +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb,
> + struct pnv_ioda_pe *pe);
> +static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
> +static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
> +{
> + struct pnv_ioda_pe *tmp, *slave;
> +
> + /* Release slave PEs in compound PE */
> + if (pe->flags & PNV_IODA_PE_MASTER) {
> + list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
> + pnv_ioda_release_pe(slave);
> + }
> +
> + /* Remove the PE from the list */
> + list_del(&pe->list);
> +
> + /* Release resources */
> + pnv_ioda_release_dma_pe(pe);
> + pnv_ioda_release_pe_seg(pe);
> + pnv_ioda_deconfigure_pe(pe->phb, pe);
> +
> + pnv_ioda_free_pe(pe);
> +}
> +
> +static inline struct pnv_ioda_pe *pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
> +{
> + if (!pe)
> + return NULL;
> +
> + pe->device_count++;
> + return pe;
> +}
> +
> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
Merge this into pnv_pci_release_device() as it is small and called only once.
> +{
> + if (!pe)
> + return;
> +
> + pe->device_count--;
> + WARN_ON(pe->device_count < 0);
> + if (pe->device_count == 0)
> + pnv_ioda_release_pe(pe);
> +}
> +
> +static void pnv_pci_release_device(struct pci_dev *pdev)
> +{
> + struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> + struct pnv_phb *phb = hose->private_data;
> + struct pci_dn *pdn = pci_get_pdn(pdev);
> + struct pnv_ioda_pe *pe;
> +
> + if (pdev->is_virtfn)
> + return;
> +
> + if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> + return;
> +
> + pe = &phb->ioda.pe_array[pdn->pe_number];
> + pnv_ioda_pe_put(pe);
> +}
> +
> static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
> {
> phb->ioda.pe_array[pe_no].phb = phb;
> @@ -724,7 +933,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
> return 0;
> }
>
> -#ifdef CONFIG_PCI_IOV
> static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> {
> struct pci_dev *parent;
> @@ -759,9 +967,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> }
> rid_end = pe->rid + (count << 8);
> } else {
> +#ifdef CONFIG_PCI_IOV
> if (pe->flags & PNV_IODA_PE_VF)
> parent = pe->parent_dev;
> else
> +#endif
> parent = pe->pdev->bus->self;
> bcomp = OpalPciBusAll;
> dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> @@ -799,11 +1009,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>
> pe->pbus = NULL;
> pe->pdev = NULL;
> +#ifdef CONFIG_PCI_IOV
> pe->parent_dev = NULL;
> +#endif
These #ifdef movements seem very much unrelated.
>
> return 0;
> }
> -#endif /* CONFIG_PCI_IOV */
>
> static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> {
> @@ -985,6 +1196,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
> continue;
>
> pdn->pe_number = pe->pe_number;
> + pnv_ioda_pe_get(pe);
> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> pnv_ioda_setup_same_PE(dev->subordinate, pe);
> }
> @@ -1047,9 +1259,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
> bus->busn_res.start, pe->pe_number);
>
> if (pnv_ioda_configure_pe(phb, pe)) {
> - /* XXX What do we do here ? */
> - pnv_ioda_free_pe(pe);
> pe->pbus = NULL;
> + pnv_ioda_release_pe(pe);
This is unrelated unexplained change.
> return NULL;
> }
>
> @@ -1199,29 +1410,6 @@ m64_failed:
> return -EBUSY;
> }
>
> -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> - int num);
> -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> -
> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> -{
> - struct iommu_table *tbl;
> - int64_t rc;
> -
> - tbl = pe->table_group.tables[0];
> - rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> - if (rc)
> - pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
> -
> - pnv_pci_ioda2_set_bypass(pe, false);
> - if (pe->table_group.group) {
> - iommu_group_put(pe->table_group.group);
> - BUG_ON(pe->table_group.group);
> - }
> - pnv_pci_ioda2_table_free_pages(tbl);
> - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> -}
> -
> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> {
> struct pci_bus *bus;
> @@ -1242,7 +1430,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> if (pe->parent_dev != pdev)
> continue;
>
> - pnv_pci_ioda2_release_dma_pe(pdev, pe);
> + pnv_pci_ioda2_release_dma_pe(pe);
This is unrelated change.
>
> /* Remove from list */
> mutex_lock(&phb->ioda.pe_list_mutex);
> @@ -3124,6 +3312,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
> .teardown_msi_irqs = pnv_teardown_msi_irqs,
> #endif
> .enable_device_hook = pnv_pci_enable_device_hook,
> + .release_device = pnv_pci_release_device,
> .window_alignment = pnv_pci_window_alignment,
> .setup_bridge = pnv_pci_setup_bridge,
> .reset_secondary_bus = pnv_pci_reset_secondary_bus,
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index ef5271a..3bb10de 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -30,6 +30,7 @@ struct pnv_phb;
> struct pnv_ioda_pe {
> unsigned long flags;
> struct pnv_phb *phb;
> + int device_count;
Not atomic_t, no kref, no additional mutex, just "int"? Are you sure about
that? If so, put a note in the commit log explaining what guarantees there
is no race.
>
> /* A PE can be associated with a single device or an
> * entire bus (& children). In the former case, pdev
>
--
Alexey Kardashevskiy
IBM OzLabs, LTC Team
e-mail: aik at au1.ibm.com
notes: Alexey Kardashevskiy/Australia/IBM
More information about the Linuxppc-dev
mailing list