[PATCH v4 07/21] powerpc/powernv: Release PEs dynamically

Gavin Shan gwshan at linux.vnet.ibm.com
Mon May 11 16:25:53 AEST 2015


On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>The original code doesn't support releasing PEs dynamically, meaning
>>that PE and the associated resources (IO, M32, M64 and DMA) can't
>>be released when unplugging a PCI adapter from one hotpluggable slot.
>>
>>The patch takes object oriented methodology, introducs reference
>>count to PE, which is initialized to 1 and increased with 1 when a
>>new PCI device joins the PE. Once the last PCI device leaves the
>>PE, the PE is going to be release together with its associated
>>(IO, M32, M64, DMA) resources.
>
>
>Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>

Ok. I'll add more details in next revision.

>>
>>Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
>>---
>>  arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>  arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>  arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>  4 files changed, 432 insertions(+), 238 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 5367eb3..a6ad4b1 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -31,6 +31,9 @@ struct pci_controller_ops {
>>  	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>  	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>>  	void		(*reset_secondary_bus)(struct pci_dev *dev);
>>+
>>+	/* Called when PCI device is released */
>>+	void		(*release_device)(struct pci_dev *);
>>  };
>>
>>  /*
>>diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>index 7ed85a6..0040343 100644
>>--- a/arch/powerpc/kernel/pci-hotplug.c
>>+++ b/arch/powerpc/kernel/pci-hotplug.c
>>@@ -29,6 +29,11 @@
>>   */
>>  void pcibios_release_device(struct pci_dev *dev)
>>  {
>>+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>+
>>+	if (hose->controller_ops.release_device)
>>+		hose->controller_ops.release_device(dev);
>>+
>>  	eeh_remove_device(dev);
>>  }
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 910fb67..ef8c216 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -12,6 +12,8 @@
>>  #undef DEBUG
>>
>>  #include <linux/kernel.h>
>>+#include <linux/atomic.h>
>>+#include <linux/kref.h>
>>  #include <linux/pci.h>
>>  #include <linux/crash_dump.h>
>>  #include <linux/debugfs.h>
>>@@ -47,6 +49,8 @@
>>  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>  #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>>
>>+static void pnv_ioda_release_pe(struct kref *kref);
>>+
>>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>  			    const char *fmt, ...)
>>  {
>>@@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>  		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>  }
>>
>>-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>  {
>>-	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>-		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>-			__func__, pe_no, phb->hose->global_number);
>>+	if (!pe)
>>+		return;
>>+
>>+	kref_get(&pe->kref);
>>+}
>>+
>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>+{
>>+	unsigned int count;
>>+
>>+	if (!pe)
>>  		return;
>>+
>>+	/*
>>+	 * The count is initialized to 1 and increased with 1 when
>>+	 * a new PCI device is bound with the PE. Once the last PCI
>>+	 * device is leaving from the PE, the PE is going to be
>>+	 * released.
>>+	 */
>>+	count = atomic_read(&pe->kref.refcount);
>>+	if (count == 2)
>>+		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>+	else
>>+		kref_put(&pe->kref, pnv_ioda_release_pe);
>
>
>What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>

Yeah, that would have problem. But it shouldn't happen because the
PCI devices are joining the parent PE# in strictly serialized mode.
Same thing happens when detaching PCI devices from its parent PE.

>>+}
>>+
>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>+{
>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+	struct pnv_phb *phb = hose->private_data;
>>+	struct pci_dn *pdn = pci_get_pdn(pdev);
>>+	struct pnv_ioda_pe *pe;
>>+
>>+	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>+		pe = &phb->ioda.pe_array[pdn->pe_number];
>>+		pnv_ioda_pe_put(pe);
>>+		pdn->pe_number = IODA_INVALID_PE;
>>  	}
>>+}
>>
>>-	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>-		pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>-			__func__, pe_no, phb->hose->global_number);
>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	int index, count;
>>+	unsigned long tbl_addr, tbl_size;
>>+
>>+	/* No DMA capability for slave PEs */
>>+	if (pe->flags & PNV_IODA_PE_SLAVE)
>>+		return;
>>+
>>+	/* Bypass DMA window */
>>+	if (phb->type == PNV_PHB_IODA2 &&
>>+	    pe->tce_bypass_enabled &&
>>+	    pe->tce32_table &&
>>+	    pe->tce32_table->set_bypass)
>>+		pe->tce32_table->set_bypass(pe->tce32_table, false);
>>+
>>+	/* 32-bits DMA window */
>>+	count = pe->tce32_seg_end - pe->tce32_seg_start;
>>+	tbl_addr = pe->tce32_table->it_base;
>>+	if (!count)
>>  		return;
>>+
>>+	/* Free IOMMU table */
>>+	iommu_free_table(pe->tce32_table,
>>+			 of_node_full_name(phb->hose->dn));
>>+
>>+	/* Deconfigure TCE table */
>>+	switch (phb->type) {
>>+	case PNV_PHB_IODA1:
>>+		for (index = 0; index < count; index++)
>>+			opal_pci_map_pe_dma_window(phb->opal_id,
>>+						   pe->pe_number,
>>+						   pe->tce32_seg_start + index,
>>+						   1,
>>+						   __pa(tbl_addr) +
>>+						   index * TCE32_TABLE_SIZE,
>>+						   0,
>>+						   0x1000);
>>+		bitmap_clear(phb->ioda.tce32_segmap,
>>+			     pe->tce32_seg_start,
>>+			     count);
>>+		tbl_size = TCE32_TABLE_SIZE * count;
>>+		break;
>>+	case PNV_PHB_IODA2:
>>+		opal_pci_map_pe_dma_window(phb->opal_id,
>>+					   pe->pe_number,
>>+					   pe->pe_number << 1,
>>+					   1,
>>+					   __pa(tbl_addr),
>>+					   0,
>>+					   0x1000);
>>+		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>+		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>+		break;
>>+	default:
>>+		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>+		return;
>>+	}
>>+
>>+	/* Free memory of IOMMU table */
>>+	free_pages(tbl_addr, get_order(tbl_size));
>
>
>You just programmed the table address to TVT and then you are releasing the
>pages. It does not seem right, it will leave garbage in TVT. Also, I am
>adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>that ;) ).
>

I assume you're talking about TVE. I don't understand how garbage will be left
in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
with zero'ed "tce_table_size". The pages previously allocated for TCE table is
released to buddy system, which can be allocated by somebody else (from buddy
or slab).

Ok. Please put me into the cc list. I guess the whole series of patches is
better to rebased on your DDW patchset, which is to be merged first, I believe.

>
>>+	pe->tce32_table = NULL;
>>+	pe->tce32_seg_start = 0;
>>+	pe->tce32_seg_end = 0;
>>+}
>>+
>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	unsigned long *segmap = NULL, *pe_segmap = NULL;
>>+	int i;
>>+	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>+				     OPAL_M32_WINDOW_TYPE,
>>+				     OPAL_M64_WINDOW_TYPE };
>>+
>>+	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>+		switch (win_type[win]) {
>>+		case OPAL_IO_WINDOW_TYPE:
>>+			segmap = phb->ioda.io_segmap;
>>+			pe_segmap = pe->io_segmap;
>>+			break;
>>+		case OPAL_M32_WINDOW_TYPE:
>>+			segmap = phb->ioda.m32_segmap;
>>+			pe_segmap = pe->m32_segmap;
>>+			break;
>>+		case OPAL_M64_WINDOW_TYPE:
>>+			segmap = phb->ioda.m64_segmap;
>>+			pe_segmap = pe->m64_segmap;
>>+			break;
>>+		}
>>+		i = -1;
>>+		while ((i = find_next_bit(pe_segmap,
>>+			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>+			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>+			    win_type[win] == OPAL_M32_WINDOW_TYPE)
>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>+						phb->ioda.reserved_pe,
>>+						win_type[win], 0, i);
>>+			else if (phb->type == PNV_PHB_IODA1)
>>+				opal_pci_map_pe_mmio_window(phb->opal_id,
>>+						phb->ioda.reserved_pe,
>>+						win_type[win],
>>+						i / 8, i % 8);
>
>The function is called ""release" but it programs something what looks like
>reasonable values, is it correct?
>

It's out of problem, When the segment is deallocated, it's mapped to the
reserved PE#.

>
>
>>+
>>+			clear_bit(i, pe_segmap);
>>+			clear_bit(i, segmap);
>>+		}
>>+	}
>>+}
>>+
>>+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>+				  struct pnv_ioda_pe *parent,
>>+				  struct pnv_ioda_pe *child,
>>+				  bool is_add)
>>+{
>>+	const char *desc = is_add ? "adding" : "removing";
>>+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>+			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>+	struct pnv_ioda_pe *slave;
>>+	long rc;
>>+
>>+	/* Parent PE affects child PE */
>>+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>+				child->pe_number, op);
>>+	if (rc != OPAL_SUCCESS) {
>>+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>+			rc, desc);
>>+		return -ENXIO;
>>+	}
>>+
>>+	if (!(child->flags & PNV_IODA_PE_MASTER))
>>+		return 0;
>>+
>>+	/* Compound case: parent PE affects slave PEs */
>>+	list_for_each_entry(slave, &child->slaves, list) {
>>+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>+					slave->pe_number, op);
>>+		if (rc != OPAL_SUCCESS) {
>>+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>+				rc, desc);
>>+			return -ENXIO;
>>+		}
>>+	}
>>+
>>+	return 0;
>>+}
>>+
>>+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct pnv_ioda_pe *slave;
>>+	struct pci_dev *pdev = NULL;
>>+	int ret;
>>+
>>+	/*
>>+	 * Clear PE frozen state. If it's master PE, we need
>>+	 * clear slave PE frozen state as well.
>>+	 */
>>+	opal_pci_eeh_freeze_clear(phb->opal_id,
>>+				  pe->pe_number,
>>+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>+			opal_pci_eeh_freeze_clear(phb->opal_id,
>>+						  slave->pe_number,
>>+						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>+		}
>>+	}
>>+
>>+	/*
>>+	 * Associate PE in PELT. We need add the PE into the
>>+	 * corresponding PELT-V as well. Otherwise, the error
>>+	 * originated from the PE might contribute to other
>>+	 * PEs.
>>+	 */
>>+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>+	if (ret)
>>+		return ret;
>>+
>>+	/* For compound PEs, any one affects all of them */
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry(slave, &pe->slaves, list) {
>>+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>+			if (ret)
>>+				return ret;
>>+		}
>>+	}
>>+
>>+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>+		pdev = pe->pbus->self;
>>+	else if (pe->flags & PNV_IODA_PE_DEV)
>>+		pdev = pe->pdev->bus->self;
>>+#ifdef CONFIG_PCI_IOV
>>+	else if (pe->flags & PNV_IODA_PE_VF)
>>+		pdev = pe->parent_dev->bus->self;
>>+#endif /* CONFIG_PCI_IOV */
>>+
>>+	while (pdev) {
>>+		struct pci_dn *pdn = pci_get_pdn(pdev);
>>+		struct pnv_ioda_pe *parent;
>>+
>>+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>+			parent = &phb->ioda.pe_array[pdn->pe_number];
>>+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>+			if (ret)
>>+				return ret;
>>+		}
>>+
>>+		pdev = pdev->bus->self;
>>+	}
>>+
>>+	return 0;
>>+}
>>+
>>+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>
>
>It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>moving of this function to a different place deserves a separate patch with a
>comment why ("it is going to be used now for non-SRIOV case too" may be?).
>

Yeah, it makes sense to me. Will fix it up.

>
>>+{
>>+	struct pnv_phb *phb = pe->phb;
>>+	struct pci_dev *parent;
>>+	uint8_t bcomp, dcomp, fcomp;
>>+	long rid_end, rid;
>>+	int64_t rc;
>>+
>>+	/* Tear down MVE */
>>+	if (phb->type == PNV_PHB_IODA1 &&
>>+	    pe->mve_number != -1) {
>>+		rc = opal_pci_set_mve(phb->opal_id,
>>+				      pe->mve_number,
>>+				      phb->ioda.reserved_pe);
>>+		if (rc != OPAL_SUCCESS)
>>+			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>+				rc, pe->mve_number);
>>+		rc = opal_pci_set_mve_enable(phb->opal_id,
>>+					     pe->mve_number,
>>+					     OPAL_DISABLE_MVE);
>>+		if (rc != OPAL_SUCCESS)
>>+			pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>+				rc, pe->mve_number);
>>+		pe->mve_number = -1;
>>+	}
>>+
>>+	/* Unmapping PELTV */
>>+	pnv_ioda_set_peltv(pe, false);
>>+
>>+	/* To unmap PELTM */
>>+	if (pe->pbus) {
>>+		int count;
>>+
>>+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>+		parent = pe->pbus->self;
>>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>+			count = pe->pbus->busn_res.end -
>>+				pe->pbus->busn_res.start + 1;
>>+		else
>>+			count = 1;
>>+
>>+		switch(count) {
>>+		case  1: bcomp = OpalPciBusAll;   break;
>>+		case  2: bcomp = OpalPciBus7Bits; break;
>>+		case  4: bcomp = OpalPciBus6Bits; break;
>>+		case  8: bcomp = OpalPciBus5Bits; break;
>>+		case 16: bcomp = OpalPciBus4Bits; break;
>>+		case 32: bcomp = OpalPciBus3Bits; break;
>>+		default:
>>+			/* Fail back to case of one bus */
>>+			pe_warn(pe, "Cannot support %d buses\n", count);
>>+			bcomp = OpalPciBusAll;
>>+		}
>>+		rid_end = pe->rid + (count << 8);
>>+	} else {
>>+#ifdef CONFIG_PCI_IOV
>>+		if (pe->flags & PNV_IODA_PE_VF)
>>+			parent = pe->parent_dev;
>>+		else
>>+#endif
>>+			parent = pe->pdev->bus->self;
>>+		bcomp = OpalPciBusAll;
>>+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>+		rid_end = pe->rid + 1;
>>+	}
>>+
>>+	/* Clear RID mapping */
>>+	for (rid = pe->rid; rid < rid_end; rid++)
>>+		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>+
>>+	/* Unmapping PELTM */
>>+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>+	if (rc)
>>+		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>+}
>>+
>>+static void pnv_ioda_release_pe(struct kref *kref)
>>+{
>>+	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>+	struct pnv_ioda_pe *tmp, *slave;
>>+	struct pnv_phb *phb = pe->phb;
>>+
>>+	pnv_ioda_release_pe_dma(pe);
>>+	pnv_ioda_release_pe_seg(pe);
>>+	pnv_ioda_deconfigure_pe(pe);
>>+
>>+	/* Release slave PEs for compound PE */
>>+	if (pe->flags & PNV_IODA_PE_MASTER) {
>>+		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>+			pnv_ioda_pe_put(slave);
>>+	}
>>+
>>+	/* Remove the PE from various list. We need remove slave
>>+	 * PE from master's list.
>>+	 */
>>+	list_del(&pe->dma_link);
>>+	list_del(&pe->list);
>>+
>>+	/* Free PE number */
>>+	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>+}
>>+
>>+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>+					    int pe_no)
>>+{
>>+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>+
>>+	kref_init(&pe->kref);
>>+	pe->phb = phb;
>>+	pe->pe_number = pe_no;
>>+	INIT_LIST_HEAD(&pe->dma_link);
>>+	INIT_LIST_HEAD(&pe->list);
>>+
>>+	return pe;
>>+}
>>+
>>+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>+					       int pe_no)
>>+{
>>+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>+			__func__, pe_no, phb->hose->global_number);
>>+		return NULL;
>>  	}
>>
>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>+	/*
>>+	 * Same PE might be reserved for multiple times, which
>>+	 * is out of problem actually.
>>+	 */
>>+	set_bit(pe_no, phb->ioda.pe_alloc);
>>+	return pnv_ioda_init_pe(phb, pe_no);
>>  }
>>
>>-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>  {
>>  	unsigned long pe_no;
>>  	unsigned long limit = phb->ioda.total_pe - 1;
>>@@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>  			break;
>>
>>  		if (--limit >= phb->ioda.total_pe)
>>-			return IODA_INVALID_PE;
>>+			return NULL;
>>  	} while(1);
>>
>>-	phb->ioda.pe_array[pe_no].phb = phb;
>>-	phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>-	return pe_no;
>>-}
>>-
>>-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>-{
>>-	WARN_ON(phb->ioda.pe_array[pe].pdev);
>>-
>>-	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>-	clear_bit(pe, phb->ioda.pe_alloc);
>>+	return pnv_ioda_init_pe(phb, pe_no);
>>  }
>>
>>  static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>@@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>  	}
>>  }
>>
>>-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>-				struct pci_bus *bus, int all)
>>+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>+						struct pci_bus *bus,
>>+						int all)
>
>
>Mechanic changes like this could easily go to a separate patch.
>

Indeed. I'll see how I can split the patches up in next revision.
Thanks for the suggestion.

>>  {
>>  	resource_size_t segsz = phb->ioda.m64_segsize;
>>  	struct pci_dev *pdev;
>>@@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	int i;
>>
>>  	if (!pnv_ioda_need_m64_pe(phb, bus))
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>
>>          /* Allocate bitmap */
>>  	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>  	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>  	if (!pe_bitsmap) {
>>  		pr_warn("%s: Out of memory !\n", __func__);
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>  	}
>>
>>  	/* The bridge's M64 window might be extended to PHB's M64
>>@@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	/* No M64 window found ? */
>>  	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>  		kfree(pe_bitsmap);
>>-		return IODA_INVALID_PE;
>>+		return NULL;
>>  	}
>>
>>  	/* Figure out the master PE and put all slave PEs
>>@@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>  	}
>>
>>  	kfree(pe_bitsmap);
>>-	return master_pe->pe_number;
>>+	return master_pe;
>>  }
>>
>>  static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>@@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>>   * but in the meantime, we need to protect them to avoid warnings
>>   */
>>  #ifdef CONFIG_PCI_MSI
>>-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>  {
>>  	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>  	struct pnv_phb *phb = hose->private_data;
>>@@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>  }
>>  #endif /* CONFIG_PCI_MSI */
>>
>>-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>-				  struct pnv_ioda_pe *parent,
>>-				  struct pnv_ioda_pe *child,
>>-				  bool is_add)
>>-{
>>-	const char *desc = is_add ? "adding" : "removing";
>>-	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>-			      OPAL_REMOVE_PE_FROM_DOMAIN;
>>-	struct pnv_ioda_pe *slave;
>>-	long rc;
>>-
>>-	/* Parent PE affects child PE */
>>-	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>-				child->pe_number, op);
>>-	if (rc != OPAL_SUCCESS) {
>>-		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>-			rc, desc);
>>-		return -ENXIO;
>>-	}
>>-
>>-	if (!(child->flags & PNV_IODA_PE_MASTER))
>>-		return 0;
>>-
>>-	/* Compound case: parent PE affects slave PEs */
>>-	list_for_each_entry(slave, &child->slaves, list) {
>>-		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>-					slave->pe_number, op);
>>-		if (rc != OPAL_SUCCESS) {
>>-			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>-				rc, desc);
>>-			return -ENXIO;
>>-		}
>>-	}
>>-
>>-	return 0;
>>-}
>>-
>>-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>-			      struct pnv_ioda_pe *pe,
>>-			      bool is_add)
>>-{
>>-	struct pnv_ioda_pe *slave;
>>-	struct pci_dev *pdev = NULL;
>>-	int ret;
>>-
>>-	/*
>>-	 * Clear PE frozen state. If it's master PE, we need
>>-	 * clear slave PE frozen state as well.
>>-	 */
>>-	if (is_add) {
>>-		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>-					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-		if (pe->flags & PNV_IODA_PE_MASTER) {
>>-			list_for_each_entry(slave, &pe->slaves, list)
>>-				opal_pci_eeh_freeze_clear(phb->opal_id,
>>-							  slave->pe_number,
>>-							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-		}
>>-	}
>>-
>>-	/*
>>-	 * Associate PE in PELT. We need add the PE into the
>>-	 * corresponding PELT-V as well. Otherwise, the error
>>-	 * originated from the PE might contribute to other
>>-	 * PEs.
>>-	 */
>>-	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>-	if (ret)
>>-		return ret;
>>-
>>-	/* For compound PEs, any one affects all of them */
>>-	if (pe->flags & PNV_IODA_PE_MASTER) {
>>-		list_for_each_entry(slave, &pe->slaves, list) {
>>-			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>-			if (ret)
>>-				return ret;
>>-		}
>>-	}
>>-
>>-	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>-		pdev = pe->pbus->self;
>>-	else if (pe->flags & PNV_IODA_PE_DEV)
>>-		pdev = pe->pdev->bus->self;
>>-#ifdef CONFIG_PCI_IOV
>>-	else if (pe->flags & PNV_IODA_PE_VF)
>>-		pdev = pe->parent_dev->bus->self;
>>-#endif /* CONFIG_PCI_IOV */
>>-	while (pdev) {
>>-		struct pci_dn *pdn = pci_get_pdn(pdev);
>>-		struct pnv_ioda_pe *parent;
>>-
>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>-			parent = &phb->ioda.pe_array[pdn->pe_number];
>>-			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>-			if (ret)
>>-				return ret;
>>-		}
>>-
>>-		pdev = pdev->bus->self;
>>-	}
>>-
>>-	return 0;
>>-}
>>-
>>-#ifdef CONFIG_PCI_IOV
>>-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>-{
>>-	struct pci_dev *parent;
>>-	uint8_t bcomp, dcomp, fcomp;
>>-	int64_t rc;
>>-	long rid_end, rid;
>>-
>>-	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>-	if (pe->pbus) {
>>-		int count;
>>-
>>-		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>-		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>-		parent = pe->pbus->self;
>>-		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>-			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
>>-		else
>>-			count = 1;
>>-
>>-		switch(count) {
>>-		case  1: bcomp = OpalPciBusAll;         break;
>>-		case  2: bcomp = OpalPciBus7Bits;       break;
>>-		case  4: bcomp = OpalPciBus6Bits;       break;
>>-		case  8: bcomp = OpalPciBus5Bits;       break;
>>-		case 16: bcomp = OpalPciBus4Bits;       break;
>>-		case 32: bcomp = OpalPciBus3Bits;       break;
>>-		default:
>>-			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
>>-			        count);
>>-			/* Do an exact match only */
>>-			bcomp = OpalPciBusAll;
>>-		}
>>-		rid_end = pe->rid + (count << 8);
>>-	} else {
>>-		if (pe->flags & PNV_IODA_PE_VF)
>>-			parent = pe->parent_dev;
>>-		else
>>-			parent = pe->pdev->bus->self;
>>-		bcomp = OpalPciBusAll;
>>-		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>-		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>-		rid_end = pe->rid + 1;
>>-	}
>>-
>>-	/* Clear the reverse map */
>>-	for (rid = pe->rid; rid < rid_end; rid++)
>>-		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>-
>>-	/* Release from all parents PELT-V */
>>-	while (parent) {
>>-		struct pci_dn *pdn = pci_get_pdn(parent);
>>-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>-						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>-			/* XXX What to do in case of error ? */
>
>
>Not much :) Free associated memory and mark it "dead" so it won't be used
>again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>all?
>

Yeah, maybe. Until now, I didn't see this failure since the code is there
from the day. Note the code has been there for almost 4 years since the
day Ben wrote it.

>
>>-		}
>>-		parent = parent->bus->self;
>>-	}
>>-
>>-	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>-
>>-	/* Disassociate PE in PELT */
>>-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>-	if (rc)
>>-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>-	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>-			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>-	if (rc)
>>-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>-
>>-	pe->pbus = NULL;
>>-	pe->pdev = NULL;
>>-	pe->parent_dev = NULL;
>>-
>>-	return 0;
>>-}
>>-#endif /* CONFIG_PCI_IOV */
>>-
>>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  {
>>  	struct pci_dev *parent;
>>@@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>  	}
>>
>>  	/* Configure PELTV */
>>-	pnv_ioda_set_peltv(phb, pe, true);
>>+	pnv_ioda_set_peltv(pe, true);
>>
>>  	/* Setup reverse map */
>>  	for (rid = pe->rid; rid < rid_end; rid++)
>>@@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>  		if (pdn->pe_number != IODA_INVALID_PE)
>>  			continue;
>>
>>+		/* Increase reference count of the parent PE */
>
>When you comment like this, I read it as the comment belongs to the whole
>next chunk till the first empty line, i.e. to all 5 lines below, which is not
>the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>suggests incrementing the reference counter 2) "pe" is always parent in this
>function. I do not insist though.
>

Agree on your explaining. I'll remove this unuseful comments.

>
>>+		pnv_ioda_pe_get(pe);
>>  		pdn->pe_number = pe->pe_number;
>>  		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>  		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>@@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>  {
>>  	struct pci_controller *hose = pci_bus_to_host(bus);
>>  	struct pnv_phb *phb = hose->private_data;
>>-	struct pnv_ioda_pe *pe;
>>+	struct pnv_ioda_pe *pe = NULL;
>>  	int pe_num = IODA_INVALID_PE;
>>
>>  	/* For partial hotplug case, the PE instance hasn't been destroyed
>>@@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>  	}
>>
>>  	/* PE number for root bus should have been reserved */
>>-	if (pci_is_root_bus(bus))
>>-		pe_num = phb->ioda.root_pe_no;
>>+	if (pci_is_root_bus(bus) &&
>>+	    phb->ioda.root_pe_no != IODA_INVALID_PE)
>>+		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>
>>  	/* Check if PE is determined by M64 */
>>-	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>-		pe_num = phb->pick_m64_pe(phb, bus, all);
>>+	if (!pe && phb->pick_m64_pe)
>>+		pe = phb->pick_m64_pe(phb, bus, all);
>>
>>  	/* The PE number isn't pinned by M64 */
>>-	if (pe_num == IODA_INVALID_PE)
>>-		pe_num = pnv_ioda_alloc_pe(phb);
>>+	if (!pe)
>>+		pe = pnv_ioda_alloc_pe(phb);
>>
>>-	if (pe_num == IODA_INVALID_PE) {
>>-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>+	if (!pe) {
>>+		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>  			__func__, pci_domain_nr(bus), bus->number);
>>  		return NULL;
>>  	}
>>
>>-	pe = &phb->ioda.pe_array[pe_num];
>>  	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>  	pe->pbus = bus;
>>  	pe->pdev = NULL;
>>@@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>
>>  	if (pnv_ioda_configure_pe(phb, pe)) {
>>  		/* XXX What do we do here ? */
>>-		if (pe_num)
>>-			pnv_ioda_free_pe(phb, pe_num);
>>-		pe->pbus = NULL;
>>+		pnv_ioda_pe_put(pe);
>>  		return NULL;
>>  	}
>>
>>  	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>-			GFP_KERNEL, hose->node);
>>+				       GFP_KERNEL, hose->node);
>
>Seems like spaces change only - if you really want this change (which I hate
>- makes code look inaccurate to my taste but it seems I am in minority here
>:) ), please put it to the separate patch.
>

Ok. Confirm with you: You prefer the original format? I don't know
why I prefer the later one. Maybe my eyes are quite broken :-)

>
>>  	pe->tce32_table->data = pe;
>>
>>  	/* Associate it with all child devices */
>>@@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  		list_del(&pe->list);
>>  		mutex_unlock(&phb->ioda.pe_list_mutex);
>>
>>-		pnv_ioda_deconfigure_pe(phb, pe);
>>+		pnv_ioda_deconfigure_pe(pe);
>
>
>Is this change necessary to get "Release PEs dynamically" working? Move it to
>mechanical changes patch may be?
>

Ok. I'll try to do that.

>
>>
>>-		pnv_ioda_free_pe(phb, pe->pe_number);
>>+		pnv_ioda_pe_put(pe);
>>  	}
>>  }
>>
>>@@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>
>>  		if (pnv_ioda_configure_pe(phb, pe)) {
>>  			/* XXX What do we do here ? */
>>-			if (pe_num)
>>-				pnv_ioda_free_pe(phb, pe_num);
>>-			pe->pdev = NULL;
>>+			pnv_ioda_pe_put(pe);
>>  			continue;
>>  		}
>>
>>@@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>>  	struct pnv_ioda_pe *pe;
>>  	int rc;
>>
>>-	pe = pnv_ioda_get_pe(dev);
>>+	pe = pnv_ioda_pci_dev_to_pe(dev);
>
>
>And this change could to separately. Not clear how this helps to "Release PEs
>dynamically".
>
>

It's not related to "Release PEs dynamically". The change is introduced by
the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe().

>>  	if (!pe)
>>  		return -ENODEV;
>>
>>@@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>>  	struct pnv_ioda_pe *pe;
>>  	int rc;
>>
>>-	if (!(pe = pnv_ioda_get_pe(dev)))
>>+	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>  		return -ENODEV;
>>
>>  	/* Assign XIVE to PE */
>>@@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>>  				  unsigned int hwirq, unsigned int virq,
>>  				  unsigned int is_64, struct msi_msg *msg)
>>  {
>>-	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>+	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>  	unsigned int xive_num = hwirq - phb->msi_base;
>>  	__be32 data;
>>  	int rc;
>>@@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>  	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>  	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>  	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>+	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>  	hose->controller_ops = pnv_pci_controller_ops;
>>
>>  #ifdef CONFIG_PCI_IOV
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index 1bea3a8..8b10f01 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -28,6 +28,7 @@ enum pnv_phb_model {
>>  /* Data associated with a PE, including IOMMU tracking etc.. */
>>  struct pnv_phb;
>>  struct pnv_ioda_pe {
>>+	struct kref		kref;
>>  	unsigned long		flags;
>>  	struct pnv_phb		*phb;
>>
>>@@ -120,7 +121,8 @@ struct pnv_phb {
>>  	void (*shutdown)(struct pnv_phb *phb);
>>  	int (*init_m64)(struct pnv_phb *phb);
>>  	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>-	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>+	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>+					   struct pci_bus *bus, int all);
>>  	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>  	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>  	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>

Thanks,
Gavin



More information about the Linuxppc-dev mailing list