[PATCH v4 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

Alexey Kardashevskiy aik at ozlabs.ru
Wed Jul 14 18:38:41 AEST 2021



On 13/07/2021 14:36, Leonardo Brás wrote:
> On Tue, 2021-05-11 at 17:57 +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 01/05/2021 02:31, Leonardo Bras wrote:
>>> [...]
>>>        pmem_present = dn != NULL;
>>> @@ -1218,8 +1224,12 @@ static bool enable_ddw(struct pci_dev *dev,
>>> struct device_node *pdn)
>>>    
>>>          mutex_lock(&direct_window_init_mutex);
>>>    
>>> -       if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset,
>>> &len))
>>> -               goto out_unlock;
>>> +       if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset,
>>> &len)) {
>>> +               direct_mapping = (len >= max_ram_len);
>>> +
>>> +               mutex_unlock(&direct_window_init_mutex);
>>> +               return direct_mapping;
>>
>> Does not this break the existing case when direct_mapping==true by
>> skipping setting dev->dev.bus_dma_limit before returning?
>>
> 
> Yes, it does. Good catch!
> I changed it to use a flag instead of win64 for return, and now I can
> use the same success exit path for both the new config and the config
> found in list. (out_unlock)
> 
>>
>>
>>> +       }
>>>    
>>>          /*
>>>           * If we already went through this for a previous function of
>>> @@ -1298,7 +1308,6 @@ static bool enable_ddw(struct pci_dev *dev,
>>> struct device_node *pdn)
>>>                  goto out_failed;
>>>          }
>>>          /* verify the window * number of ptes will map the partition
>>> */
>>> -       /* check largest block * page size > max memory hotplug addr
>>> */
>>>          /*
>>>           * The "ibm,pmemory" can appear anywhere in the address
>>> space.
>>>           * Assuming it is still backed by page structs, try
>>> MAX_PHYSMEM_BITS
>>> @@ -1320,6 +1329,17 @@ static bool enable_ddw(struct pci_dev *dev,
>>> struct device_node *pdn)
>>>                          1ULL << len,
>>>                          query.largest_available_block,
>>>                          1ULL << page_shift);
>>> +
>>> +               len = order_base_2(query.largest_available_block <<
>>> page_shift);
>>> +               win_name = DMA64_PROPNAME;
>>
>> [1] ....
>>
>>
>>> +       } else {
>>> +               direct_mapping = true;
>>> +               win_name = DIRECT64_PROPNAME;
>>> +       }
>>> +
>>> +       /* DDW + IOMMU on single window may fail if there is any
>>> allocation */
>>> +       if (default_win_removed && !direct_mapping &&
>>> iommu_table_in_use(tbl)) {
>>> +               dev_dbg(&dev->dev, "current IOMMU table in use, can't
>>> be replaced.\n");
>>
>>
>> ... remove !direct_mapping and move to [1]?
> 
> 
> sure, done!
> 
>>
>>
>>>                  goto out_failed;
>>>          }
>>>    
>>> @@ -1331,8 +1351,7 @@ static bool enable_ddw(struct pci_dev *dev,
>>> struct device_node *pdn)
>>>                    create.liobn, dn);
>>>    
>>>          win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
>>> -       win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn,
>>> win_addr,
>>> -                                   page_shift, len);
>>> +       win64 = ddw_property_create(win_name, create.liobn, win_addr,
>>> page_shift, len);
>>>          if (!win64) {
>>>                  dev_info(&dev->dev,
>>>                           "couldn't allocate property, property name,
>>> or value\n");
>>> @@ -1350,12 +1369,47 @@ static bool enable_ddw(struct pci_dev *dev,
>>> struct device_node *pdn)
>>>          if (!window)
>>>                  goto out_del_prop;
>>>    
>>> -       ret = walk_system_ram_range(0, memblock_end_of_DRAM() >>
>>> PAGE_SHIFT,
>>> -                       win64->value,
>>> tce_setrange_multi_pSeriesLP_walk);
>>> -       if (ret) {
>>> -               dev_info(&dev->dev, "failed to map direct window for
>>> %pOF: %d\n",
>>> -                        dn, ret);
>>> -               goto out_del_list;
>>> +       if (direct_mapping) {
>>> +               /* DDW maps the whole partition, so enable direct DMA
>>> mapping */
>>> +               ret = walk_system_ram_range(0, memblock_end_of_DRAM()
>>>>> PAGE_SHIFT,
>>> +                                           win64->value,
>>> tce_setrange_multi_pSeriesLP_walk);
>>> +               if (ret) {
>>> +                       dev_info(&dev->dev, "failed to map direct
>>> window for %pOF: %d\n",
>>> +                                dn, ret);
>>> +                       goto out_del_list;
>>> +               }
>>> +       } else {
>>> +               struct iommu_table *newtbl;
>>> +               int i;
>>> +
>>> +               /* New table for using DDW instead of the default DMA
>>> window */
>>> +               newtbl = iommu_pseries_alloc_table(pci->phb->node);
>>> +               if (!newtbl) {
>>> +                       dev_dbg(&dev->dev, "couldn't create new IOMMU
>>> table\n");
>>> +                       goto out_del_list;
>>> +               }
>>> +
>>> +               for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources);
>>> i++) {
>>> +                       const unsigned long mask = IORESOURCE_MEM_64
>>> | IORESOURCE_MEM;
>>> +
>>> +                       /* Look for MMIO32 */
>>> +                       if ((pci->phb->mem_resources[i].flags & mask)
>>> == IORESOURCE_MEM)
>>> +                               break;
>>
>> What if there is no IORESOURCE_MEM? pci->phb->mem_resources[i].start
>> below will have garbage.
> 
> 
> 
> Yeah, that makes sense. I will add this lines after 'for':
> 
> if (i == ARRAY_SIZE(pci->phb->mem_resources)) {
>   iommu_tce_table_put(newtbl);
>   goto out_del_list;
> }
> 
> What do you think?


Move this and that "for" before iommu_pseries_alloc_table() so you won't 
need to free if there is no IORESOURCE_MEM.


> 
> 
>>
>>
>>> +               }
>>> +
>>> +               _iommu_table_setparms(newtbl, pci->phb->bus->number,
>>> create.liobn, win_addr,
>>> +                                     1UL << len, page_shift, 0,
>>> &iommu_table_lpar_multi_ops);
>>> +               iommu_init_table(newtbl, pci->phb->node, pci->phb-
>>>> mem_resources[i].start,
>>> +                                pci->phb->mem_resources[i].end);
>>> +
>>> +               if (default_win_removed)
>>> +                       iommu_tce_table_put(tbl);
>>
>>
>> iommu_tce_table_put() should have been called when the window was
>> removed.
>>
>> Also after some thinking - what happens if there were 2 devices in the
>> PE and one requested 64bit DMA? This will only update
>> set_iommu_table_base() for the 64bit one but not for the other.
>>
>> I think the right thing to do is:
>>
>> 1. check if table[0] is in use, if yes => fail (which this does
>> already)
>>
>> 2. remove default dma window but keep the iommu_table struct with one
>> change - set it_size to 0 (and free it_map) so the 32bit device won't
>> look at a stale structure and think there is some window (imaginery
>> situation for phyp but easy to recreate in qemu).
>>
>> 3. use table[1] for newly created indirect DDW window.
>>
>> 4. change get_iommu_table_base() to return a usable table (or may be
>> not
>> needed?).
>>
>> If this sounds reasonable (does it?),
> 
> Looks ok, I will try your suggestion.
> I was not aware of how pci->table_group->tables[] worked, so I replaced
> pci->table_group->tables[0] with the new tbl, while moving the older in
> pci->table_group->tables[1].


pci->table_group->tables[0] is window#0 at @0.
pci->table_group->tables[1] is window#1 at 0x0800.0000.0000.0000. That 
is all :)

pseries does not use tables[1] but powernv does (by VFIO only though).


> (4) get_iommu_table_base() does not seem to need update, as it returns
> the tlb set by set_iommu_table_base() which is already called in the
> !direct_mapping path in current patch.

Sounds right.

> 
>>   the question is now if you have
>> time to do that and the hardware to test that, or I'll have to finish
>> the work :)
> 
> Sorry, for some reason part of this got lost in Evolution mail client.
> 
> If possible, I do want to finish this work, and I am talking to IBM
> Virt people in order to get testing HW.


Even I struggle to find a powervm machine :)

>>
>>
>>> +               else
>>> +                       pci->table_group->tables[1] = tbl;
>>
>>
>> What is this for?
> 
> I was thinking of adding the older table to pci->table_group->tables[1]
> while keeping the newer table on pci->table_group->tables[0].
> This did work, but I think your suggestion may work better.
> 
> Best regards,
> Leonardo Bras
> 
> 

-- 
Alexey


More information about the Linuxppc-dev mailing list