[PATCH v4 07/21] powerpc/powernv: Release PEs dynamically
Alexey Kardashevskiy
aik at ozlabs.ru
Mon May 11 17:02:08 AEST 2015
On 05/11/2015 04:25 PM, Gavin Shan wrote:
> On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>> On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>> The original code doesn't support releasing PEs dynamically, meaning
>>> that PE and the associated resources (IO, M32, M64 and DMA) can't
>>> be released when unplugging a PCI adapter from one hotpluggable slot.
>>>
>>> The patch takes object oriented methodology, introducs reference
>>> count to PE, which is initialized to 1 and increased with 1 when a
>>> new PCI device joins the PE. Once the last PCI device leaves the
>>> PE, the PE is going to be release together with its associated
>>> (IO, M32, M64, DMA) resources.
>>
>>
>> Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>
>
> Ok. I'll add more details in next revision.
>
>>>
>>> Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
>>> ---
>>> arch/powerpc/include/asm/pci-bridge.h | 3 +
>>> arch/powerpc/kernel/pci-hotplug.c | 5 +
>>> arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++-----------
>>> arch/powerpc/platforms/powernv/pci.h | 4 +-
>>> 4 files changed, 432 insertions(+), 238 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>> index 5367eb3..a6ad4b1 100644
>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>> @@ -31,6 +31,9 @@ struct pci_controller_ops {
>>> resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
>>> void (*setup_bridge)(struct pci_bus *, unsigned long);
>>> void (*reset_secondary_bus)(struct pci_dev *dev);
>>> +
>>> + /* Called when PCI device is released */
>>> + void (*release_device)(struct pci_dev *);
>>> };
>>>
>>> /*
>>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
>>> index 7ed85a6..0040343 100644
>>> --- a/arch/powerpc/kernel/pci-hotplug.c
>>> +++ b/arch/powerpc/kernel/pci-hotplug.c
>>> @@ -29,6 +29,11 @@
>>> */
>>> void pcibios_release_device(struct pci_dev *dev)
>>> {
>>> + struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>> +
>>> + if (hose->controller_ops.release_device)
>>> + hose->controller_ops.release_device(dev);
>>> +
>>> eeh_remove_device(dev);
>>> }
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index 910fb67..ef8c216 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -12,6 +12,8 @@
>>> #undef DEBUG
>>>
>>> #include <linux/kernel.h>
>>> +#include <linux/atomic.h>
>>> +#include <linux/kref.h>
>>> #include <linux/pci.h>
>>> #include <linux/crash_dump.h>
>>> #include <linux/debugfs.h>
>>> @@ -47,6 +49,8 @@
>>> /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>> #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
>>>
>>> +static void pnv_ioda_release_pe(struct kref *kref);
>>> +
>>> static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>>> const char *fmt, ...)
>>> {
>>> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
>>> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>> }
>>>
>>> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>> {
>>> - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>> - pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>> - __func__, pe_no, phb->hose->global_number);
>>> + if (!pe)
>>> + return;
>>> +
>>> + kref_get(&pe->kref);
>>> +}
>>> +
>>> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>> +{
>>> + unsigned int count;
>>> +
>>> + if (!pe)
>>> return;
>>> +
>>> + /*
>>> + * The count is initialized to 1 and increased with 1 when
>>> + * a new PCI device is bound with the PE. Once the last PCI
>>> + * device is leaving from the PE, the PE is going to be
>>> + * released.
>>> + */
>>> + count = atomic_read(&pe->kref.refcount);
>>> + if (count == 2)
>>> + kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>> + else
>>> + kref_put(&pe->kref, pnv_ioda_release_pe);
>>
>>
>> What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>
>
> Yeah, that would have problem. But it shouldn't happen because the
> PCI devices are joining the parent PE# in strictly serialized mode.
> Same thing happens when detaching PCI devices from its parent PE.
oookay. Another thing then - why is this kref counter initialized to 1?
It would make sense if you did something special when the counter becomes 1
after decrement but you do not.
Also, this kref thing makes sense if you do kref_put() in multiple places
and do not know which one will be the last one so you pass the callback to
all of them. Here you do kref_put/sub in one place and you read the counter
- so you can call pnv_ioda_release_pe() directly. And it feels like a
simple atomic_t would do the job just fine. If you still feel that the
counter should start from 1, there are atomic_dec_if_positive() and
atomic_inc_not_zero() and others.
>>> +}
>>> +
>>> +static void pnv_pci_release_device(struct pci_dev *pdev)
>>> +{
>>> + struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>> + struct pnv_phb *phb = hose->private_data;
>>> + struct pci_dn *pdn = pci_get_pdn(pdev);
>>> + struct pnv_ioda_pe *pe;
>>> +
>>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> + pe = &phb->ioda.pe_array[pdn->pe_number];
>>> + pnv_ioda_pe_put(pe);
>>> + pdn->pe_number = IODA_INVALID_PE;
>>> }
>>> +}
>>>
>>> - if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>> - pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>> - __func__, pe_no, phb->hose->global_number);
>>> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>> +{
>>> + struct pnv_phb *phb = pe->phb;
>>> + int index, count;
>>> + unsigned long tbl_addr, tbl_size;
>>> +
>>> + /* No DMA capability for slave PEs */
>>> + if (pe->flags & PNV_IODA_PE_SLAVE)
>>> + return;
>>> +
>>> + /* Bypass DMA window */
>>> + if (phb->type == PNV_PHB_IODA2 &&
>>> + pe->tce_bypass_enabled &&
>>> + pe->tce32_table &&
>>> + pe->tce32_table->set_bypass)
>>> + pe->tce32_table->set_bypass(pe->tce32_table, false);
>>> +
>>> + /* 32-bits DMA window */
>>> + count = pe->tce32_seg_end - pe->tce32_seg_start;
>>> + tbl_addr = pe->tce32_table->it_base;
>>> + if (!count)
>>> return;
>>> +
>>> + /* Free IOMMU table */
>>> + iommu_free_table(pe->tce32_table,
>>> + of_node_full_name(phb->hose->dn));
>>> +
>>> + /* Deconfigure TCE table */
>>> + switch (phb->type) {
>>> + case PNV_PHB_IODA1:
>>> + for (index = 0; index < count; index++)
>>> + opal_pci_map_pe_dma_window(phb->opal_id,
>>> + pe->pe_number,
>>> + pe->tce32_seg_start + index,
>>> + 1,
>>> + __pa(tbl_addr) +
>>> + index * TCE32_TABLE_SIZE,
>>> + 0,
>>> + 0x1000);
>>> + bitmap_clear(phb->ioda.tce32_segmap,
>>> + pe->tce32_seg_start,
>>> + count);
>>> + tbl_size = TCE32_TABLE_SIZE * count;
>>> + break;
>>> + case PNV_PHB_IODA2:
>>> + opal_pci_map_pe_dma_window(phb->opal_id,
>>> + pe->pe_number,
>>> + pe->pe_number << 1,
>>> + 1,
>>> + __pa(tbl_addr),
>>> + 0,
>>> + 0x1000);
>>> + tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>> + tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>> + break;
>>> + default:
>>> + pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>> + return;
>>> + }
>>> +
>>> + /* Free memory of IOMMU table */
>>> + free_pages(tbl_addr, get_order(tbl_size));
>>
>>
>> You just programmed the table address to TVT and then you are releasing the
>> pages. It does not seem right, it will leave garbage in TVT. Also, I am
>> adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>> that ;) ).
>>
>
> I assume you're talking about TVE. I don't understand how garbage will be left
> in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
> with zero'ed "tce_table_size". The pages previously allocated for TCE table is
> released to buddy system, which can be allocated by somebody else (from buddy
> or slab).
opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some
memory which is still allocated. This value goes to a table (which has 2
entries per PE, one for 32bit DMA window and one for bypass/hugewindow)
which PHB uses to get the actual TCE table address. What is the name of
this table? :) Anyway, you write an address there and then you call
free_pages() so after free_pages(), the value in that TVE/TVT/whatever
table is a garbage.
>
> Ok. Please put me into the cc list. I guess the whole series of patches is
> better to rebased on your DDW patchset, which is to be merged first, I believe.
>
>>
>>> + pe->tce32_table = NULL;
>>> + pe->tce32_seg_start = 0;
>>> + pe->tce32_seg_end = 0;
>>> +}
>>> +
>>> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>> +{
>>> + struct pnv_phb *phb = pe->phb;
>>> + unsigned long *segmap = NULL, *pe_segmap = NULL;
>>> + int i;
>>> + uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>> + OPAL_M32_WINDOW_TYPE,
>>> + OPAL_M64_WINDOW_TYPE };
>>> +
>>> + for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>> + switch (win_type[win]) {
>>> + case OPAL_IO_WINDOW_TYPE:
>>> + segmap = phb->ioda.io_segmap;
>>> + pe_segmap = pe->io_segmap;
>>> + break;
>>> + case OPAL_M32_WINDOW_TYPE:
>>> + segmap = phb->ioda.m32_segmap;
>>> + pe_segmap = pe->m32_segmap;
>>> + break;
>>> + case OPAL_M64_WINDOW_TYPE:
>>> + segmap = phb->ioda.m64_segmap;
>>> + pe_segmap = pe->m64_segmap;
>>> + break;
>>> + }
>>> + i = -1;
>>> + while ((i = find_next_bit(pe_segmap,
>>> + phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>> + if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>> + win_type[win] == OPAL_M32_WINDOW_TYPE)
>>> + opal_pci_map_pe_mmio_window(phb->opal_id,
>>> + phb->ioda.reserved_pe,
>>> + win_type[win], 0, i);
>>> + else if (phb->type == PNV_PHB_IODA1)
>>> + opal_pci_map_pe_mmio_window(phb->opal_id,
>>> + phb->ioda.reserved_pe,
>>> + win_type[win],
>>> + i / 8, i % 8);
>>
>> The function is called ""release" but it programs something what looks like
>> reasonable values, is it correct?
>>
>
> It's out of problem, When the segment is deallocated, it's mapped to the
> reserved PE#.
>
>>
>>
>>> +
>>> + clear_bit(i, pe_segmap);
>>> + clear_bit(i, segmap);
>>> + }
>>> + }
>>> +}
>>> +
>>> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>> + struct pnv_ioda_pe *parent,
>>> + struct pnv_ioda_pe *child,
>>> + bool is_add)
>>> +{
>>> + const char *desc = is_add ? "adding" : "removing";
>>> + uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>> + OPAL_REMOVE_PE_FROM_DOMAIN;
>>> + struct pnv_ioda_pe *slave;
>>> + long rc;
>>> +
>>> + /* Parent PE affects child PE */
>>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> + child->pe_number, op);
>>> + if (rc != OPAL_SUCCESS) {
>>> + pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>> + rc, desc);
>>> + return -ENXIO;
>>> + }
>>> +
>>> + if (!(child->flags & PNV_IODA_PE_MASTER))
>>> + return 0;
>>> +
>>> + /* Compound case: parent PE affects slave PEs */
>>> + list_for_each_entry(slave, &child->slaves, list) {
>>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> + slave->pe_number, op);
>>> + if (rc != OPAL_SUCCESS) {
>>> + pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>> + rc, desc);
>>> + return -ENXIO;
>>> + }
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>> +{
>>> + struct pnv_phb *phb = pe->phb;
>>> + struct pnv_ioda_pe *slave;
>>> + struct pci_dev *pdev = NULL;
>>> + int ret;
>>> +
>>> + /*
>>> + * Clear PE frozen state. If it's master PE, we need
>>> + * clear slave PE frozen state as well.
>>> + */
>>> + opal_pci_eeh_freeze_clear(phb->opal_id,
>>> + pe->pe_number,
>>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> + if (pe->flags & PNV_IODA_PE_MASTER) {
>>> + list_for_each_entry(slave, &pe->slaves, list) {
>>> + opal_pci_eeh_freeze_clear(phb->opal_id,
>>> + slave->pe_number,
>>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> + }
>>> + }
>>> +
>>> + /*
>>> + * Associate PE in PELT. We need add the PE into the
>>> + * corresponding PELT-V as well. Otherwise, the error
>>> + * originated from the PE might contribute to other
>>> + * PEs.
>>> + */
>>> + ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>> + if (ret)
>>> + return ret;
>>> +
>>> + /* For compound PEs, any one affects all of them */
>>> + if (pe->flags & PNV_IODA_PE_MASTER) {
>>> + list_for_each_entry(slave, &pe->slaves, list) {
>>> + ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>> + if (ret)
>>> + return ret;
>>> + }
>>> + }
>>> +
>>> + if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>> + pdev = pe->pbus->self;
>>> + else if (pe->flags & PNV_IODA_PE_DEV)
>>> + pdev = pe->pdev->bus->self;
>>> +#ifdef CONFIG_PCI_IOV
>>> + else if (pe->flags & PNV_IODA_PE_VF)
>>> + pdev = pe->parent_dev->bus->self;
>>> +#endif /* CONFIG_PCI_IOV */
>>> +
>>> + while (pdev) {
>>> + struct pci_dn *pdn = pci_get_pdn(pdev);
>>> + struct pnv_ioda_pe *parent;
>>> +
>>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> + parent = &phb->ioda.pe_array[pdn->pe_number];
>>> + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>> + if (ret)
>>> + return ret;
>>> + }
>>> +
>>> + pdev = pdev->bus->self;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>>
>>
>> It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>> moving of this function to a different place deserves a separate patch with a
>> comment why ("it is going to be used now for non-SRIOV case too" may be?).
>>
>
> Yeah, it makes sense to me. Will fix it up.
>
>>
>>> +{
>>> + struct pnv_phb *phb = pe->phb;
>>> + struct pci_dev *parent;
>>> + uint8_t bcomp, dcomp, fcomp;
>>> + long rid_end, rid;
>>> + int64_t rc;
>>> +
>>> + /* Tear down MVE */
>>> + if (phb->type == PNV_PHB_IODA1 &&
>>> + pe->mve_number != -1) {
>>> + rc = opal_pci_set_mve(phb->opal_id,
>>> + pe->mve_number,
>>> + phb->ioda.reserved_pe);
>>> + if (rc != OPAL_SUCCESS)
>>> + pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>> + rc, pe->mve_number);
>>> + rc = opal_pci_set_mve_enable(phb->opal_id,
>>> + pe->mve_number,
>>> + OPAL_DISABLE_MVE);
>>> + if (rc != OPAL_SUCCESS)
>>> + pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>> + rc, pe->mve_number);
>>> + pe->mve_number = -1;
>>> + }
>>> +
>>> + /* Unmapping PELTV */
>>> + pnv_ioda_set_peltv(pe, false);
>>> +
>>> + /* To unmap PELTM */
>>> + if (pe->pbus) {
>>> + int count;
>>> +
>>> + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>> + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>> + parent = pe->pbus->self;
>>> + if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>> + count = pe->pbus->busn_res.end -
>>> + pe->pbus->busn_res.start + 1;
>>> + else
>>> + count = 1;
>>> +
>>> + switch(count) {
>>> + case 1: bcomp = OpalPciBusAll; break;
>>> + case 2: bcomp = OpalPciBus7Bits; break;
>>> + case 4: bcomp = OpalPciBus6Bits; break;
>>> + case 8: bcomp = OpalPciBus5Bits; break;
>>> + case 16: bcomp = OpalPciBus4Bits; break;
>>> + case 32: bcomp = OpalPciBus3Bits; break;
>>> + default:
>>> + /* Fail back to case of one bus */
>>> + pe_warn(pe, "Cannot support %d buses\n", count);
>>> + bcomp = OpalPciBusAll;
>>> + }
>>> + rid_end = pe->rid + (count << 8);
>>> + } else {
>>> +#ifdef CONFIG_PCI_IOV
>>> + if (pe->flags & PNV_IODA_PE_VF)
>>> + parent = pe->parent_dev;
>>> + else
>>> +#endif
>>> + parent = pe->pdev->bus->self;
>>> + bcomp = OpalPciBusAll;
>>> + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>> + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>> + rid_end = pe->rid + 1;
>>> + }
>>> +
>>> + /* Clear RID mapping */
>>> + for (rid = pe->rid; rid < rid_end; rid++)
>>> + phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>> +
>>> + /* Unmapping PELTM */
>>> + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>> + bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>> + if (rc)
>>> + pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>> +}
>>> +
>>> +static void pnv_ioda_release_pe(struct kref *kref)
>>> +{
>>> + struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>> + struct pnv_ioda_pe *tmp, *slave;
>>> + struct pnv_phb *phb = pe->phb;
>>> +
>>> + pnv_ioda_release_pe_dma(pe);
>>> + pnv_ioda_release_pe_seg(pe);
>>> + pnv_ioda_deconfigure_pe(pe);
>>> +
>>> + /* Release slave PEs for compound PE */
>>> + if (pe->flags & PNV_IODA_PE_MASTER) {
>>> + list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>> + pnv_ioda_pe_put(slave);
>>> + }
>>> +
>>> + /* Remove the PE from various list. We need remove slave
>>> + * PE from master's list.
>>> + */
>>> + list_del(&pe->dma_link);
>>> + list_del(&pe->list);
>>> +
>>> + /* Free PE number */
>>> + clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>> +}
>>> +
>>> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>> + int pe_no)
>>> +{
>>> + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>> +
>>> + kref_init(&pe->kref);
>>> + pe->phb = phb;
>>> + pe->pe_number = pe_no;
>>> + INIT_LIST_HEAD(&pe->dma_link);
>>> + INIT_LIST_HEAD(&pe->list);
>>> +
>>> + return pe;
>>> +}
>>> +
>>> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>> + int pe_no)
>>> +{
>>> + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>> + pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>> + __func__, pe_no, phb->hose->global_number);
>>> + return NULL;
>>> }
>>>
>>> - phb->ioda.pe_array[pe_no].phb = phb;
>>> - phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>> + /*
>>> + * Same PE might be reserved for multiple times, which
>>> + * is out of problem actually.
>>> + */
>>> + set_bit(pe_no, phb->ioda.pe_alloc);
>>> + return pnv_ioda_init_pe(phb, pe_no);
>>> }
>>>
>>> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>> {
>>> unsigned long pe_no;
>>> unsigned long limit = phb->ioda.total_pe - 1;
>>> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>> break;
>>>
>>> if (--limit >= phb->ioda.total_pe)
>>> - return IODA_INVALID_PE;
>>> + return NULL;
>>> } while(1);
>>>
>>> - phb->ioda.pe_array[pe_no].phb = phb;
>>> - phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>> - return pe_no;
>>> -}
>>> -
>>> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>> -{
>>> - WARN_ON(phb->ioda.pe_array[pe].pdev);
>>> -
>>> - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>> - clear_bit(pe, phb->ioda.pe_alloc);
>>> + return pnv_ioda_init_pe(phb, pe_no);
>>> }
>>>
>>> static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>> }
>>> }
>>>
>>> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> - struct pci_bus *bus, int all)
>>> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> + struct pci_bus *bus,
>>> + int all)
>>
>>
>> Mechanic changes like this could easily go to a separate patch.
>>
>
> Indeed. I'll see how I can split the patches up in next revision.
> Thanks for the suggestion.
>
>>> {
>>> resource_size_t segsz = phb->ioda.m64_segsize;
>>> struct pci_dev *pdev;
>>> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> int i;
>>>
>>> if (!pnv_ioda_need_m64_pe(phb, bus))
>>> - return IODA_INVALID_PE;
>>> + return NULL;
>>>
>>> /* Allocate bitmap */
>>> size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>> pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>> if (!pe_bitsmap) {
>>> pr_warn("%s: Out of memory !\n", __func__);
>>> - return IODA_INVALID_PE;
>>> + return NULL;
>>> }
>>>
>>> /* The bridge's M64 window might be extended to PHB's M64
>>> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> /* No M64 window found ? */
>>> if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>> kfree(pe_bitsmap);
>>> - return IODA_INVALID_PE;
>>> + return NULL;
>>> }
>>>
>>> /* Figure out the master PE and put all slave PEs
>>> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>> }
>>>
>>> kfree(pe_bitsmap);
>>> - return master_pe->pe_number;
>>> + return master_pe;
>>> }
>>>
>>> static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
>>> * but in the meantime, we need to protect them to avoid warnings
>>> */
>>> #ifdef CONFIG_PCI_MSI
>>> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>> {
>>> struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>> struct pnv_phb *phb = hose->private_data;
>>> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>> }
>>> #endif /* CONFIG_PCI_MSI */
>>>
>>> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>> - struct pnv_ioda_pe *parent,
>>> - struct pnv_ioda_pe *child,
>>> - bool is_add)
>>> -{
>>> - const char *desc = is_add ? "adding" : "removing";
>>> - uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>> - OPAL_REMOVE_PE_FROM_DOMAIN;
>>> - struct pnv_ioda_pe *slave;
>>> - long rc;
>>> -
>>> - /* Parent PE affects child PE */
>>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> - child->pe_number, op);
>>> - if (rc != OPAL_SUCCESS) {
>>> - pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>> - rc, desc);
>>> - return -ENXIO;
>>> - }
>>> -
>>> - if (!(child->flags & PNV_IODA_PE_MASTER))
>>> - return 0;
>>> -
>>> - /* Compound case: parent PE affects slave PEs */
>>> - list_for_each_entry(slave, &child->slaves, list) {
>>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>> - slave->pe_number, op);
>>> - if (rc != OPAL_SUCCESS) {
>>> - pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>> - rc, desc);
>>> - return -ENXIO;
>>> - }
>>> - }
>>> -
>>> - return 0;
>>> -}
>>> -
>>> -static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>> - struct pnv_ioda_pe *pe,
>>> - bool is_add)
>>> -{
>>> - struct pnv_ioda_pe *slave;
>>> - struct pci_dev *pdev = NULL;
>>> - int ret;
>>> -
>>> - /*
>>> - * Clear PE frozen state. If it's master PE, we need
>>> - * clear slave PE frozen state as well.
>>> - */
>>> - if (is_add) {
>>> - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> - if (pe->flags & PNV_IODA_PE_MASTER) {
>>> - list_for_each_entry(slave, &pe->slaves, list)
>>> - opal_pci_eeh_freeze_clear(phb->opal_id,
>>> - slave->pe_number,
>>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> - }
>>> - }
>>> -
>>> - /*
>>> - * Associate PE in PELT. We need add the PE into the
>>> - * corresponding PELT-V as well. Otherwise, the error
>>> - * originated from the PE might contribute to other
>>> - * PEs.
>>> - */
>>> - ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>> - if (ret)
>>> - return ret;
>>> -
>>> - /* For compound PEs, any one affects all of them */
>>> - if (pe->flags & PNV_IODA_PE_MASTER) {
>>> - list_for_each_entry(slave, &pe->slaves, list) {
>>> - ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>> - if (ret)
>>> - return ret;
>>> - }
>>> - }
>>> -
>>> - if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>> - pdev = pe->pbus->self;
>>> - else if (pe->flags & PNV_IODA_PE_DEV)
>>> - pdev = pe->pdev->bus->self;
>>> -#ifdef CONFIG_PCI_IOV
>>> - else if (pe->flags & PNV_IODA_PE_VF)
>>> - pdev = pe->parent_dev->bus->self;
>>> -#endif /* CONFIG_PCI_IOV */
>>> - while (pdev) {
>>> - struct pci_dn *pdn = pci_get_pdn(pdev);
>>> - struct pnv_ioda_pe *parent;
>>> -
>>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> - parent = &phb->ioda.pe_array[pdn->pe_number];
>>> - ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>> - if (ret)
>>> - return ret;
>>> - }
>>> -
>>> - pdev = pdev->bus->self;
>>> - }
>>> -
>>> - return 0;
>>> -}
>>> -
>>> -#ifdef CONFIG_PCI_IOV
>>> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>> -{
>>> - struct pci_dev *parent;
>>> - uint8_t bcomp, dcomp, fcomp;
>>> - int64_t rc;
>>> - long rid_end, rid;
>>> -
>>> - /* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>> - if (pe->pbus) {
>>> - int count;
>>> -
>>> - dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>> - fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>> - parent = pe->pbus->self;
>>> - if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>> - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
>>> - else
>>> - count = 1;
>>> -
>>> - switch(count) {
>>> - case 1: bcomp = OpalPciBusAll; break;
>>> - case 2: bcomp = OpalPciBus7Bits; break;
>>> - case 4: bcomp = OpalPciBus6Bits; break;
>>> - case 8: bcomp = OpalPciBus5Bits; break;
>>> - case 16: bcomp = OpalPciBus4Bits; break;
>>> - case 32: bcomp = OpalPciBus3Bits; break;
>>> - default:
>>> - dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
>>> - count);
>>> - /* Do an exact match only */
>>> - bcomp = OpalPciBusAll;
>>> - }
>>> - rid_end = pe->rid + (count << 8);
>>> - } else {
>>> - if (pe->flags & PNV_IODA_PE_VF)
>>> - parent = pe->parent_dev;
>>> - else
>>> - parent = pe->pdev->bus->self;
>>> - bcomp = OpalPciBusAll;
>>> - dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>> - fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>> - rid_end = pe->rid + 1;
>>> - }
>>> -
>>> - /* Clear the reverse map */
>>> - for (rid = pe->rid; rid < rid_end; rid++)
>>> - phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>> -
>>> - /* Release from all parents PELT-V */
>>> - while (parent) {
>>> - struct pci_dn *pdn = pci_get_pdn(parent);
>>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>> - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>> - /* XXX What to do in case of error ? */
>>
>>
>> Not much :) Free associated memory and mark it "dead" so it won't be used
>> again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>> all?
>>
>
> Yeah, maybe. Until now, I didn't see this failure since the code is there
> from the day. Note the code has been there for almost 4 years since the
> day Ben wrote it.
Sure. But if it starts failing, we won't even notice it - there is no even
pr_err() or WARN_ON.
>
>>
>>> - }
>>> - parent = parent->bus->self;
>>> - }
>>> -
>>> - opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>> -
>>> - /* Disassociate PE in PELT */
>>> - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>> - if (rc)
>>> - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>> - rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>> - bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>> - if (rc)
>>> - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>> -
>>> - pe->pbus = NULL;
>>> - pe->pdev = NULL;
>>> - pe->parent_dev = NULL;
>>> -
>>> - return 0;
>>> -}
>>> -#endif /* CONFIG_PCI_IOV */
>>> -
>>> static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>> {
>>> struct pci_dev *parent;
>>> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
>>> }
>>>
>>> /* Configure PELTV */
>>> - pnv_ioda_set_peltv(phb, pe, true);
>>> + pnv_ioda_set_peltv(pe, true);
>>>
>>> /* Setup reverse map */
>>> for (rid = pe->rid; rid < rid_end; rid++)
>>> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>> if (pdn->pe_number != IODA_INVALID_PE)
>>> continue;
>>>
>>> + /* Increase reference count of the parent PE */
>>
>> When you comment like this, I read it as the comment belongs to the whole
>> next chunk till the first empty line, i.e. to all 5 lines below, which is not
>> the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>> suggests incrementing the reference counter 2) "pe" is always parent in this
>> function. I do not insist though.
>>
>
> Agree on your explaining. I'll remove this unuseful comments.
>
>>
>>> + pnv_ioda_pe_get(pe);
>>> pdn->pe_number = pe->pe_number;
>>> pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>> {
>>> struct pci_controller *hose = pci_bus_to_host(bus);
>>> struct pnv_phb *phb = hose->private_data;
>>> - struct pnv_ioda_pe *pe;
>>> + struct pnv_ioda_pe *pe = NULL;
>>> int pe_num = IODA_INVALID_PE;
>>>
>>> /* For partial hotplug case, the PE instance hasn't been destroyed
>>> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>> }
>>>
>>> /* PE number for root bus should have been reserved */
>>> - if (pci_is_root_bus(bus))
>>> - pe_num = phb->ioda.root_pe_no;
>>> + if (pci_is_root_bus(bus) &&
>>> + phb->ioda.root_pe_no != IODA_INVALID_PE)
>>> + pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>>
>>> /* Check if PE is determined by M64 */
>>> - if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>> - pe_num = phb->pick_m64_pe(phb, bus, all);
>>> + if (!pe && phb->pick_m64_pe)
>>> + pe = phb->pick_m64_pe(phb, bus, all);
>>>
>>> /* The PE number isn't pinned by M64 */
>>> - if (pe_num == IODA_INVALID_PE)
>>> - pe_num = pnv_ioda_alloc_pe(phb);
>>> + if (!pe)
>>> + pe = pnv_ioda_alloc_pe(phb);
>>>
>>> - if (pe_num == IODA_INVALID_PE) {
>>> - pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>> + if (!pe) {
>>> + pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>> __func__, pci_domain_nr(bus), bus->number);
>>> return NULL;
>>> }
>>>
>>> - pe = &phb->ioda.pe_array[pe_num];
>>> pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>> pe->pbus = bus;
>>> pe->pdev = NULL;
>>> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>
>>> if (pnv_ioda_configure_pe(phb, pe)) {
>>> /* XXX What do we do here ? */
>>> - if (pe_num)
>>> - pnv_ioda_free_pe(phb, pe_num);
>>> - pe->pbus = NULL;
>>> + pnv_ioda_pe_put(pe);
>>> return NULL;
>>> }
>>>
>>> pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>> - GFP_KERNEL, hose->node);
>>> + GFP_KERNEL, hose->node);
>>
>> Seems like spaces change only - if you really want this change (which I hate
>> - makes code look inaccurate to my taste but it seems I am in minority here
>> :) ), please put it to the separate patch.
>>
>
> Ok. Confirm with you: You prefer the original format? I don't know
> why I prefer the later one. Maybe my eyes are quite broken :-)
I prefer not to change existing whitespaces unless it is done once and for
the entire file :) Just remove this change from the patch.
>>
>>> pe->tce32_table->data = pe;
>>>
>>> /* Associate it with all child devices */
>>> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>> list_del(&pe->list);
>>> mutex_unlock(&phb->ioda.pe_list_mutex);
>>>
>>> - pnv_ioda_deconfigure_pe(phb, pe);
>>> + pnv_ioda_deconfigure_pe(pe);
>>
>>
>> Is this change necessary to get "Release PEs dynamically" working? Move it to
>> mechanical changes patch may be?
>>
>
> Ok. I'll try to do that.
>
>>
>>>
>>> - pnv_ioda_free_pe(phb, pe->pe_number);
>>> + pnv_ioda_pe_put(pe);
>>> }
>>> }
>>>
>>> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> if (pnv_ioda_configure_pe(phb, pe)) {
>>> /* XXX What do we do here ? */
>>> - if (pe_num)
>>> - pnv_ioda_free_pe(phb, pe_num);
>>> - pe->pdev = NULL;
>>> + pnv_ioda_pe_put(pe);
>>> continue;
>>> }
>>>
>>> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
>>> struct pnv_ioda_pe *pe;
>>> int rc;
>>>
>>> - pe = pnv_ioda_get_pe(dev);
>>> + pe = pnv_ioda_pci_dev_to_pe(dev);
>>
>>
>> And this change could to separately. Not clear how this helps to "Release PEs
>> dynamically".
>>
>>
>
> It's not related to "Release PEs dynamically". The change is introduced by
> the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe().
But the rename happened in this patch and the patch's subj is "Release PEs
dynamically" so it should be related somehow or move it to a simple
separate patch "let's give the lalala function a better name to reflect
what it actually does" (but in this case the new name does not make any
more sense than the old one).
>>> if (!pe)
>>> return -ENODEV;
>>>
>>> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>>> struct pnv_ioda_pe *pe;
>>> int rc;
>>>
>>> - if (!(pe = pnv_ioda_get_pe(dev)))
>>> + if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>> return -ENODEV;
>>>
>>> /* Assign XIVE to PE */
>>> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
>>> unsigned int hwirq, unsigned int virq,
>>> unsigned int is_64, struct msi_msg *msg)
>>> {
>>> - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>> + struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>> unsigned int xive_num = hwirq - phb->msi_base;
>>> __be32 data;
>>> int rc;
>>> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>>> pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>> pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>> pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
>>> + pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>> hose->controller_ops = pnv_pci_controller_ops;
>>>
>>> #ifdef CONFIG_PCI_IOV
>>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>> index 1bea3a8..8b10f01 100644
>>> --- a/arch/powerpc/platforms/powernv/pci.h
>>> +++ b/arch/powerpc/platforms/powernv/pci.h
>>> @@ -28,6 +28,7 @@ enum pnv_phb_model {
>>> /* Data associated with a PE, including IOMMU tracking etc.. */
>>> struct pnv_phb;
>>> struct pnv_ioda_pe {
>>> + struct kref kref;
>>> unsigned long flags;
>>> struct pnv_phb *phb;
>>>
>>> @@ -120,7 +121,8 @@ struct pnv_phb {
>>> void (*shutdown)(struct pnv_phb *phb);
>>> int (*init_m64)(struct pnv_phb *phb);
>>> void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>> - int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>> + struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>> + struct pci_bus *bus, int all);
>>> int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>> void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>> int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>>
>
> Thanks,
> Gavin
>
--
Alexey
More information about the Linuxppc-dev
mailing list