[PATCH v4 07/21] powerpc/powernv: Release PEs dynamically

Alexey Kardashevskiy aik at ozlabs.ru
Sat May 9 22:43:23 AEST 2015


On 05/01/2015 04:02 PM, Gavin Shan wrote:
> The original code doesn't support releasing PEs dynamically, meaning
> that PE and the associated resources (IO, M32, M64 and DMA) can't
> be released when unplugging a PCI adapter from one hotpluggable slot.
>
> The patch takes object oriented methodology, introducs reference
> count to PE, which is initialized to 1 and increased with 1 when a
> new PCI device joins the PE. Once the last PCI device leaves the
> PE, the PE is going to be release together with its associated
> (IO, M32, M64, DMA) resources.


Too little commit log for non-trivial non-cut-n-paste 30KB patch...


>
> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/pci-bridge.h     |   3 +
>   arch/powerpc/kernel/pci-hotplug.c         |   5 +
>   arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>   arch/powerpc/platforms/powernv/pci.h      |   4 +-
>   4 files changed, 432 insertions(+), 238 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 5367eb3..a6ad4b1 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -31,6 +31,9 @@ struct pci_controller_ops {
>   	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>   	void		(*setup_bridge)(struct pci_bus *, unsigned long);
>   	void		(*reset_secondary_bus)(struct pci_dev *dev);
> +
> +	/* Called when PCI device is released */
> +	void		(*release_device)(struct pci_dev *);
>   };
>
>   /*
> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
> index 7ed85a6..0040343 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -29,6 +29,11 @@
>    */
>   void pcibios_release_device(struct pci_dev *dev)
>   {
> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> +
> +	if (hose->controller_ops.release_device)
> +		hose->controller_ops.release_device(dev);
> +
>   	eeh_remove_device(dev);
>   }
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 910fb67..ef8c216 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -12,6 +12,8 @@
>   #undef DEBUG
>
>   #include <linux/kernel.h>
> +#include <linux/atomic.h>
> +#include <linux/kref.h>
>   #include <linux/pci.h>
>   #include <linux/crash_dump.h>
>   #include <linux/debugfs.h>
> @@ -47,6 +49,8 @@
>   /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>   #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>
> +static void pnv_ioda_release_pe(struct kref *kref);
> +
>   static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>   			    const char *fmt, ...)
>   {
> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>   		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>   }
>
> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>   {
> -	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
> -		pr_warn("%s: Invalid PE %d on PHB#%x\n",
> -			__func__, pe_no, phb->hose->global_number);
> +	if (!pe)
> +		return;
> +
> +	kref_get(&pe->kref);
> +}
> +
> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
> +{
> +	unsigned int count;
> +
> +	if (!pe)
>   		return;
> +
> +	/*
> +	 * The count is initialized to 1 and increased with 1 when
> +	 * a new PCI device is bound with the PE. Once the last PCI
> +	 * device is leaving from the PE, the PE is going to be
> +	 * released.
> +	 */
> +	count = atomic_read(&pe->kref.refcount);
> +	if (count == 2)
> +		kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
> +	else
> +		kref_put(&pe->kref, pnv_ioda_release_pe);


What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?


> +}
> +
> +static void pnv_pci_release_device(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> +	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +	struct pnv_ioda_pe *pe;
> +
> +	if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> +		pe = &phb->ioda.pe_array[pdn->pe_number];
> +		pnv_ioda_pe_put(pe);
> +		pdn->pe_number = IODA_INVALID_PE;
>   	}
> +}
>
> -	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
> -		pr_warn("%s: PE %d was assigned on PHB#%x\n",
> -			__func__, pe_no, phb->hose->global_number);
> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	int index, count;
> +	unsigned long tbl_addr, tbl_size;
> +
> +	/* No DMA capability for slave PEs */
> +	if (pe->flags & PNV_IODA_PE_SLAVE)
> +		return;
> +
> +	/* Bypass DMA window */
> +	if (phb->type == PNV_PHB_IODA2 &&
> +	    pe->tce_bypass_enabled &&
> +	    pe->tce32_table &&
> +	    pe->tce32_table->set_bypass)
> +		pe->tce32_table->set_bypass(pe->tce32_table, false);
> +
> +	/* 32-bits DMA window */
> +	count = pe->tce32_seg_end - pe->tce32_seg_start;
> +	tbl_addr = pe->tce32_table->it_base;
> +	if (!count)
>   		return;
> +
> +	/* Free IOMMU table */
> +	iommu_free_table(pe->tce32_table,
> +			 of_node_full_name(phb->hose->dn));
> +
> +	/* Deconfigure TCE table */
> +	switch (phb->type) {
> +	case PNV_PHB_IODA1:
> +		for (index = 0; index < count; index++)
> +			opal_pci_map_pe_dma_window(phb->opal_id,
> +						   pe->pe_number,
> +						   pe->tce32_seg_start + index,
> +						   1,
> +						   __pa(tbl_addr) +
> +						   index * TCE32_TABLE_SIZE,
> +						   0,
> +						   0x1000);
> +		bitmap_clear(phb->ioda.tce32_segmap,
> +			     pe->tce32_seg_start,
> +			     count);
> +		tbl_size = TCE32_TABLE_SIZE * count;
> +		break;
> +	case PNV_PHB_IODA2:
> +		opal_pci_map_pe_dma_window(phb->opal_id,
> +					   pe->pe_number,
> +					   pe->pe_number << 1,
> +					   1,
> +					   __pa(tbl_addr),
> +					   0,
> +					   0x1000);
> +		tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
> +		tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
> +		break;
> +	default:
> +		pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
> +		return;
> +	}
> +
> +	/* Free memory of IOMMU table */
> +	free_pages(tbl_addr, get_order(tbl_size));


You just programmed the table address to TVT and then you are releasing the 
pages. It does not seem right, it will leave garbage in TVT. Also, I am 
adding helpers to alloc/free TCE pages in DDW patchset, you could reuse 
bits from there (I'll post v10 soon, you'll be in copy and you'll have to 
review that ;) ).


> +	pe->tce32_table = NULL;
> +	pe->tce32_seg_start = 0;
> +	pe->tce32_seg_end = 0;
> +}
> +
> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	unsigned long *segmap = NULL, *pe_segmap = NULL;
> +	int i;
> +	uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
> +				     OPAL_M32_WINDOW_TYPE,
> +				     OPAL_M64_WINDOW_TYPE };
> +
> +	for (win = 0; win < ARRAY_SIZE(win_type); win++) {
> +		switch (win_type[win]) {
> +		case OPAL_IO_WINDOW_TYPE:
> +			segmap = phb->ioda.io_segmap;
> +			pe_segmap = pe->io_segmap;
> +			break;
> +		case OPAL_M32_WINDOW_TYPE:
> +			segmap = phb->ioda.m32_segmap;
> +			pe_segmap = pe->m32_segmap;
> +			break;
> +		case OPAL_M64_WINDOW_TYPE:
> +			segmap = phb->ioda.m64_segmap;
> +			pe_segmap = pe->m64_segmap;
> +			break;
> +		}
> +		i = -1;
> +		while ((i = find_next_bit(pe_segmap,
> +			phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
> +			if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
> +			    win_type[win] == OPAL_M32_WINDOW_TYPE)
> +				opal_pci_map_pe_mmio_window(phb->opal_id,
> +						phb->ioda.reserved_pe,
> +						win_type[win], 0, i);
> +			else if (phb->type == PNV_PHB_IODA1)
> +				opal_pci_map_pe_mmio_window(phb->opal_id,
> +						phb->ioda.reserved_pe,
> +						win_type[win],
> +						i / 8, i % 8);

The function is called ""release" but it programs something what looks like 
reasonable values, is it correct?



> +
> +			clear_bit(i, pe_segmap);
> +			clear_bit(i, segmap);
> +		}
> +	}
> +}
> +
> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
> +				  struct pnv_ioda_pe *parent,
> +				  struct pnv_ioda_pe *child,
> +				  bool is_add)
> +{
> +	const char *desc = is_add ? "adding" : "removing";
> +	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
> +			      OPAL_REMOVE_PE_FROM_DOMAIN;
> +	struct pnv_ioda_pe *slave;
> +	long rc;
> +
> +	/* Parent PE affects child PE */
> +	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> +				child->pe_number, op);
> +	if (rc != OPAL_SUCCESS) {
> +		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
> +			rc, desc);
> +		return -ENXIO;
> +	}
> +
> +	if (!(child->flags & PNV_IODA_PE_MASTER))
> +		return 0;
> +
> +	/* Compound case: parent PE affects slave PEs */
> +	list_for_each_entry(slave, &child->slaves, list) {
> +		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> +					slave->pe_number, op);
> +		if (rc != OPAL_SUCCESS) {
> +			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
> +				rc, desc);
> +			return -ENXIO;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
> +{
> +	struct pnv_phb *phb = pe->phb;
> +	struct pnv_ioda_pe *slave;
> +	struct pci_dev *pdev = NULL;
> +	int ret;
> +
> +	/*
> +	 * Clear PE frozen state. If it's master PE, we need
> +	 * clear slave PE frozen state as well.
> +	 */
> +	opal_pci_eeh_freeze_clear(phb->opal_id,
> +				  pe->pe_number,
> +				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry(slave, &pe->slaves, list) {
> +			opal_pci_eeh_freeze_clear(phb->opal_id,
> +						  slave->pe_number,
> +						  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +		}
> +	}
> +
> +	/*
> +	 * Associate PE in PELT. We need add the PE into the
> +	 * corresponding PELT-V as well. Otherwise, the error
> +	 * originated from the PE might contribute to other
> +	 * PEs.
> +	 */
> +	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
> +	if (ret)
> +		return ret;
> +
> +	/* For compound PEs, any one affects all of them */
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry(slave, &pe->slaves, list) {
> +			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
> +		pdev = pe->pbus->self;
> +	else if (pe->flags & PNV_IODA_PE_DEV)
> +		pdev = pe->pdev->bus->self;
> +#ifdef CONFIG_PCI_IOV
> +	else if (pe->flags & PNV_IODA_PE_VF)
> +		pdev = pe->parent_dev->bus->self;
> +#endif /* CONFIG_PCI_IOV */
> +
> +	while (pdev) {
> +		struct pci_dn *pdn = pci_get_pdn(pdev);
> +		struct pnv_ioda_pe *parent;
> +
> +		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> +			parent = &phb->ioda.pe_array[pdn->pe_number];
> +			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		pdev = pdev->bus->self;
> +	}
> +
> +	return 0;
> +}
> +
> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)


It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just 
moving of this function to a different place deserves a separate patch with 
a comment why ("it is going to be used now for non-SRIOV case too" may be?).



> +{
> +	struct pnv_phb *phb = pe->phb;
> +	struct pci_dev *parent;
> +	uint8_t bcomp, dcomp, fcomp;
> +	long rid_end, rid;
> +	int64_t rc;
> +
> +	/* Tear down MVE */
> +	if (phb->type == PNV_PHB_IODA1 &&
> +	    pe->mve_number != -1) {
> +		rc = opal_pci_set_mve(phb->opal_id,
> +				      pe->mve_number,
> +				      phb->ioda.reserved_pe);
> +		if (rc != OPAL_SUCCESS)
> +			pe_warn(pe, "Error %lld unmapping MVE#%d\n",
> +				rc, pe->mve_number);
> +		rc = opal_pci_set_mve_enable(phb->opal_id,
> +					     pe->mve_number,
> +					     OPAL_DISABLE_MVE);
> +		if (rc != OPAL_SUCCESS)
> +			pe_warn(pe, "Error %lld disabling MVE#%d\n",
> +				rc, pe->mve_number);
> +		pe->mve_number = -1;
> +	}
> +
> +	/* Unmapping PELTV */
> +	pnv_ioda_set_peltv(pe, false);
> +
> +	/* To unmap PELTM */
> +	if (pe->pbus) {
> +		int count;
> +
> +		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> +		parent = pe->pbus->self;
> +		if (pe->flags & PNV_IODA_PE_BUS_ALL)
> +			count = pe->pbus->busn_res.end -
> +				pe->pbus->busn_res.start + 1;
> +		else
> +			count = 1;
> +
> +		switch(count) {
> +		case  1: bcomp = OpalPciBusAll;   break;
> +		case  2: bcomp = OpalPciBus7Bits; break;
> +		case  4: bcomp = OpalPciBus6Bits; break;
> +		case  8: bcomp = OpalPciBus5Bits; break;
> +		case 16: bcomp = OpalPciBus4Bits; break;
> +		case 32: bcomp = OpalPciBus3Bits; break;
> +		default:
> +			/* Fail back to case of one bus */
> +			pe_warn(pe, "Cannot support %d buses\n", count);
> +			bcomp = OpalPciBusAll;
> +		}
> +		rid_end = pe->rid + (count << 8);
> +	} else {
> +#ifdef CONFIG_PCI_IOV
> +		if (pe->flags & PNV_IODA_PE_VF)
> +			parent = pe->parent_dev;
> +		else
> +#endif
> +			parent = pe->pdev->bus->self;
> +		bcomp = OpalPciBusAll;
> +		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> +		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> +		rid_end = pe->rid + 1;
> +	}
> +
> +	/* Clear RID mapping */
> +	for (rid = pe->rid; rid < rid_end; rid++)
> +		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
> +
> +	/* Unmapping PELTM */
> +	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
> +			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
> +	if (rc)
> +		pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
> +}
> +
> +static void pnv_ioda_release_pe(struct kref *kref)
> +{
> +	struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
> +	struct pnv_ioda_pe *tmp, *slave;
> +	struct pnv_phb *phb = pe->phb;
> +
> +	pnv_ioda_release_pe_dma(pe);
> +	pnv_ioda_release_pe_seg(pe);
> +	pnv_ioda_deconfigure_pe(pe);
> +
> +	/* Release slave PEs for compound PE */
> +	if (pe->flags & PNV_IODA_PE_MASTER) {
> +		list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
> +			pnv_ioda_pe_put(slave);
> +	}
> +
> +	/* Remove the PE from various list. We need remove slave
> +	 * PE from master's list.
> +	 */
> +	list_del(&pe->dma_link);
> +	list_del(&pe->list);
> +
> +	/* Free PE number */
> +	clear_bit(pe->pe_number, phb->ioda.pe_alloc);
> +}
> +
> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
> +					    int pe_no)
> +{
> +	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
> +
> +	kref_init(&pe->kref);
> +	pe->phb = phb;
> +	pe->pe_number = pe_no;
> +	INIT_LIST_HEAD(&pe->dma_link);
> +	INIT_LIST_HEAD(&pe->list);
> +
> +	return pe;
> +}
> +
> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
> +					       int pe_no)
> +{
> +	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
> +		pr_warn("%s: Invalid PE %d on PHB#%x\n",
> +			__func__, pe_no, phb->hose->global_number);
> +		return NULL;
>   	}
>
> -	phb->ioda.pe_array[pe_no].phb = phb;
> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> +	/*
> +	 * Same PE might be reserved for multiple times, which
> +	 * is out of problem actually.
> +	 */
> +	set_bit(pe_no, phb->ioda.pe_alloc);
> +	return pnv_ioda_init_pe(phb, pe_no);
>   }
>
> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>   {
>   	unsigned long pe_no;
>   	unsigned long limit = phb->ioda.total_pe - 1;
> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>   			break;
>
>   		if (--limit >= phb->ioda.total_pe)
> -			return IODA_INVALID_PE;
> +			return NULL;
>   	} while(1);
>
> -	phb->ioda.pe_array[pe_no].phb = phb;
> -	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> -	return pe_no;
> -}
> -
> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
> -{
> -	WARN_ON(phb->ioda.pe_array[pe].pdev);
> -
> -	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
> -	clear_bit(pe, phb->ioda.pe_alloc);
> +	return pnv_ioda_init_pe(phb, pe_no);
>   }
>
>   static int pnv_ioda1_init_m64(struct pnv_phb *phb)
> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>   	}
>   }
>
> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
> -				struct pci_bus *bus, int all)
> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
> +						struct pci_bus *bus,
> +						int all)


Mechanic changes like this could easily go to a separate patch.


>   {
>   	resource_size_t segsz = phb->ioda.m64_segsize;
>   	struct pci_dev *pdev;
> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	int i;
>
>   	if (!pnv_ioda_need_m64_pe(phb, bus))
> -		return IODA_INVALID_PE;
> +		return NULL;
>
>           /* Allocate bitmap */
>   	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>   	pe_bitsmap = kzalloc(size, GFP_KERNEL);
>   	if (!pe_bitsmap) {
>   		pr_warn("%s: Out of memory !\n", __func__);
> -		return IODA_INVALID_PE;
> +		return NULL;
>   	}
>
>   	/* The bridge's M64 window might be extended to PHB's M64
> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	/* No M64 window found ? */
>   	if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>   		kfree(pe_bitsmap);
> -		return IODA_INVALID_PE;
> +		return NULL;
>   	}
>
>   	/* Figure out the master PE and put all slave PEs
> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>   	}
>
>   	kfree(pe_bitsmap);
> -	return master_pe->pe_number;
> +	return master_pe;
>   }
>
>   static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>    * but in the meantime, we need to protect them to avoid warnings
>    */
>   #ifdef CONFIG_PCI_MSI
> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>   {
>   	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>   	struct pnv_phb *phb = hose->private_data;
> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>   }
>   #endif /* CONFIG_PCI_MSI */
>
> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
> -				  struct pnv_ioda_pe *parent,
> -				  struct pnv_ioda_pe *child,
> -				  bool is_add)
> -{
> -	const char *desc = is_add ? "adding" : "removing";
> -	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
> -			      OPAL_REMOVE_PE_FROM_DOMAIN;
> -	struct pnv_ioda_pe *slave;
> -	long rc;
> -
> -	/* Parent PE affects child PE */
> -	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> -				child->pe_number, op);
> -	if (rc != OPAL_SUCCESS) {
> -		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
> -			rc, desc);
> -		return -ENXIO;
> -	}
> -
> -	if (!(child->flags & PNV_IODA_PE_MASTER))
> -		return 0;
> -
> -	/* Compound case: parent PE affects slave PEs */
> -	list_for_each_entry(slave, &child->slaves, list) {
> -		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
> -					slave->pe_number, op);
> -		if (rc != OPAL_SUCCESS) {
> -			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
> -				rc, desc);
> -			return -ENXIO;
> -		}
> -	}
> -
> -	return 0;
> -}
> -
> -static int pnv_ioda_set_peltv(struct pnv_phb *phb,
> -			      struct pnv_ioda_pe *pe,
> -			      bool is_add)
> -{
> -	struct pnv_ioda_pe *slave;
> -	struct pci_dev *pdev = NULL;
> -	int ret;
> -
> -	/*
> -	 * Clear PE frozen state. If it's master PE, we need
> -	 * clear slave PE frozen state as well.
> -	 */
> -	if (is_add) {
> -		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
> -					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -		if (pe->flags & PNV_IODA_PE_MASTER) {
> -			list_for_each_entry(slave, &pe->slaves, list)
> -				opal_pci_eeh_freeze_clear(phb->opal_id,
> -							  slave->pe_number,
> -							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -		}
> -	}
> -
> -	/*
> -	 * Associate PE in PELT. We need add the PE into the
> -	 * corresponding PELT-V as well. Otherwise, the error
> -	 * originated from the PE might contribute to other
> -	 * PEs.
> -	 */
> -	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
> -	if (ret)
> -		return ret;
> -
> -	/* For compound PEs, any one affects all of them */
> -	if (pe->flags & PNV_IODA_PE_MASTER) {
> -		list_for_each_entry(slave, &pe->slaves, list) {
> -			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
> -			if (ret)
> -				return ret;
> -		}
> -	}
> -
> -	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
> -		pdev = pe->pbus->self;
> -	else if (pe->flags & PNV_IODA_PE_DEV)
> -		pdev = pe->pdev->bus->self;
> -#ifdef CONFIG_PCI_IOV
> -	else if (pe->flags & PNV_IODA_PE_VF)
> -		pdev = pe->parent_dev->bus->self;
> -#endif /* CONFIG_PCI_IOV */
> -	while (pdev) {
> -		struct pci_dn *pdn = pci_get_pdn(pdev);
> -		struct pnv_ioda_pe *parent;
> -
> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> -			parent = &phb->ioda.pe_array[pdn->pe_number];
> -			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
> -			if (ret)
> -				return ret;
> -		}
> -
> -		pdev = pdev->bus->self;
> -	}
> -
> -	return 0;
> -}
> -
> -#ifdef CONFIG_PCI_IOV
> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
> -{
> -	struct pci_dev *parent;
> -	uint8_t bcomp, dcomp, fcomp;
> -	int64_t rc;
> -	long rid_end, rid;
> -
> -	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
> -	if (pe->pbus) {
> -		int count;
> -
> -		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> -		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> -		parent = pe->pbus->self;
> -		if (pe->flags & PNV_IODA_PE_BUS_ALL)
> -			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
> -		else
> -			count = 1;
> -
> -		switch(count) {
> -		case  1: bcomp = OpalPciBusAll;         break;
> -		case  2: bcomp = OpalPciBus7Bits;       break;
> -		case  4: bcomp = OpalPciBus6Bits;       break;
> -		case  8: bcomp = OpalPciBus5Bits;       break;
> -		case 16: bcomp = OpalPciBus4Bits;       break;
> -		case 32: bcomp = OpalPciBus3Bits;       break;
> -		default:
> -			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
> -			        count);
> -			/* Do an exact match only */
> -			bcomp = OpalPciBusAll;
> -		}
> -		rid_end = pe->rid + (count << 8);
> -	} else {
> -		if (pe->flags & PNV_IODA_PE_VF)
> -			parent = pe->parent_dev;
> -		else
> -			parent = pe->pdev->bus->self;
> -		bcomp = OpalPciBusAll;
> -		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
> -		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
> -		rid_end = pe->rid + 1;
> -	}
> -
> -	/* Clear the reverse map */
> -	for (rid = pe->rid; rid < rid_end; rid++)
> -		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
> -
> -	/* Release from all parents PELT-V */
> -	while (parent) {
> -		struct pci_dn *pdn = pci_get_pdn(parent);
> -		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
> -			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
> -						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> -			/* XXX What to do in case of error ? */


Not much :) Free associated memory and mark it "dead" so it won't be used 
again till reboot. In what circumstance can this opal_pci_set_peltv() fail 
at all?


> -		}
> -		parent = parent->bus->self;
> -	}
> -
> -	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
> -				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> -
> -	/* Disassociate PE in PELT */
> -	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
> -				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
> -	if (rc)
> -		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
> -	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
> -			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
> -	if (rc)
> -		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
> -
> -	pe->pbus = NULL;
> -	pe->pdev = NULL;
> -	pe->parent_dev = NULL;
> -
> -	return 0;
> -}
> -#endif /* CONFIG_PCI_IOV */
> -
>   static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>   {
>   	struct pci_dev *parent;
> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>   	}
>
>   	/* Configure PELTV */
> -	pnv_ioda_set_peltv(phb, pe, true);
> +	pnv_ioda_set_peltv(pe, true);
>
>   	/* Setup reverse map */
>   	for (rid = pe->rid; rid < rid_end; rid++)
> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>   		if (pdn->pe_number != IODA_INVALID_PE)
>   			continue;
>
> +		/* Increase reference count of the parent PE */

When you comment like this, I read it as the comment belongs to the whole 
next chunk till the first empty line, i.e. to all 5 lines below, which is 
not the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() 
name suggests incrementing the reference counter 2) "pe" is always parent 
in this function. I do not insist though.


> +		pnv_ioda_pe_get(pe);
>   		pdn->pe_number = pe->pe_number;
>   		pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>   		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>   {
>   	struct pci_controller *hose = pci_bus_to_host(bus);
>   	struct pnv_phb *phb = hose->private_data;
> -	struct pnv_ioda_pe *pe;
> +	struct pnv_ioda_pe *pe = NULL;
>   	int pe_num = IODA_INVALID_PE;
>
>   	/* For partial hotplug case, the PE instance hasn't been destroyed
> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>   	}
>
>   	/* PE number for root bus should have been reserved */
> -	if (pci_is_root_bus(bus))
> -		pe_num = phb->ioda.root_pe_no;
> +	if (pci_is_root_bus(bus) &&
> +	    phb->ioda.root_pe_no != IODA_INVALID_PE)
> +		pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>
>   	/* Check if PE is determined by M64 */
> -	if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
> -		pe_num = phb->pick_m64_pe(phb, bus, all);
> +	if (!pe && phb->pick_m64_pe)
> +		pe = phb->pick_m64_pe(phb, bus, all);
>
>   	/* The PE number isn't pinned by M64 */
> -	if (pe_num == IODA_INVALID_PE)
> -		pe_num = pnv_ioda_alloc_pe(phb);
> +	if (!pe)
> +		pe = pnv_ioda_alloc_pe(phb);
>
> -	if (pe_num == IODA_INVALID_PE) {
> -		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
> +	if (!pe) {
> +		pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>   			__func__, pci_domain_nr(bus), bus->number);
>   		return NULL;
>   	}
>
> -	pe = &phb->ioda.pe_array[pe_num];
>   	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>   	pe->pbus = bus;
>   	pe->pdev = NULL;
> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>
>   	if (pnv_ioda_configure_pe(phb, pe)) {
>   		/* XXX What do we do here ? */
> -		if (pe_num)
> -			pnv_ioda_free_pe(phb, pe_num);
> -		pe->pbus = NULL;
> +		pnv_ioda_pe_put(pe);
>   		return NULL;
>   	}
>
>   	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> -			GFP_KERNEL, hose->node);
> +				       GFP_KERNEL, hose->node);

Seems like spaces change only - if you really want this change (which I 
hate - makes code look inaccurate to my taste but it seems I am in minority 
here :) ), please put it to the separate patch.


>   	pe->tce32_table->data = pe;
>
>   	/* Associate it with all child devices */
> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>   		list_del(&pe->list);
>   		mutex_unlock(&phb->ioda.pe_list_mutex);
>
> -		pnv_ioda_deconfigure_pe(phb, pe);
> +		pnv_ioda_deconfigure_pe(pe);


Is this change necessary to get "Release PEs dynamically" working? Move it 
to mechanical changes patch may be?


>
> -		pnv_ioda_free_pe(phb, pe->pe_number);
> +		pnv_ioda_pe_put(pe);
>   	}
>   }
>
> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
>   		if (pnv_ioda_configure_pe(phb, pe)) {
>   			/* XXX What do we do here ? */
> -			if (pe_num)
> -				pnv_ioda_free_pe(phb, pe_num);
> -			pe->pdev = NULL;
> +			pnv_ioda_pe_put(pe);
>   			continue;
>   		}
>
> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>   	struct pnv_ioda_pe *pe;
>   	int rc;
>
> -	pe = pnv_ioda_get_pe(dev);
> +	pe = pnv_ioda_pci_dev_to_pe(dev);


And this change could to separately. Not clear how this helps to "Release 
PEs dynamically".


>   	if (!pe)
>   		return -ENODEV;
>
> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>   	struct pnv_ioda_pe *pe;
>   	int rc;
>
> -	if (!(pe = pnv_ioda_get_pe(dev)))
> +	if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>   		return -ENODEV;
>
>   	/* Assign XIVE to PE */
> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>   				  unsigned int hwirq, unsigned int virq,
>   				  unsigned int is_64, struct msi_msg *msg)
>   {
> -	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
> +	struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>   	unsigned int xive_num = hwirq - phb->msi_base;
>   	__be32 data;
>   	int rc;
> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>   	pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>   	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>   	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
> +	pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>   	hose->controller_ops = pnv_pci_controller_ops;
>
>   #ifdef CONFIG_PCI_IOV
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 1bea3a8..8b10f01 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -28,6 +28,7 @@ enum pnv_phb_model {
>   /* Data associated with a PE, including IOMMU tracking etc.. */
>   struct pnv_phb;
>   struct pnv_ioda_pe {
> +	struct kref		kref;
>   	unsigned long		flags;
>   	struct pnv_phb		*phb;
>
> @@ -120,7 +121,8 @@ struct pnv_phb {
>   	void (*shutdown)(struct pnv_phb *phb);
>   	int (*init_m64)(struct pnv_phb *phb);
>   	void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
> -	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
> +	struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
> +					   struct pci_bus *bus, int all);
>   	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>   	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>   	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>


-- 
Alexey


More information about the Linuxppc-dev mailing list