[PATCH v6 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping
Frederic Barrat
fbarrat at linux.ibm.com
Sat Aug 28 03:06:44 AEST 2021
On 17/08/2021 08:39, Leonardo Bras wrote:
> So far it has been assumed possible to map the guest RAM 1:1 to the bus,
> which works with a small number of devices. SRIOV changes this, as the
> user can configure hundreds of VFs, and since phyp preallocates TCEs and
> does not allow IOMMU pages bigger than 64K, it has to limit the number
> of TCEs per PE to limit the waste of physical pages.
>
> As of today, if the assumed direct mapping is not possible, DDW creation
> is skipped and the default DMA window "ibm,dma-window" is used instead.
>
> By using DDW, indirect mapping can get more TCEs than are available for
> the default DMA window, and can also use much larger page sizes (16MB as
> implemented in QEMU vs 4K for the default DMA window), causing a
> significant increase in the maximum amount of memory that can be IOMMU
> mapped at the same time.
>
> Indirect mapping will only be used if direct mapping is not a
> possibility.
>
> For indirect mapping, it's necessary to re-create the iommu_table with
> the new DMA window parameters, so iommu_alloc() can use it.
>
> Removing the default DMA window in order to use DDW with indirect
> mapping is only allowed if there is no IOMMU memory currently allocated
> in the iommu_table; enable_ddw() is aborted otherwise.
>
> Even though there won't be both direct and indirect mappings at the
> same time, we can't reuse the DIRECT64_PROPNAME property name, or else
> an older kexec()ed kernel can assume direct mapping, and skip
> iommu_alloc(), causing undesirable behavior.
> So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
> was created to represent a DDW that does not allow direct mapping.
>
> Signed-off-by: Leonardo Bras <leobras.c at gmail.com>
> ---
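The part of the argument worth keeping in mind is that the win here is
almost entirely the page size. A throwaway userspace sketch of the
arithmetic -- the 2M TCE count below is made up purely for illustration,
it is not a number taken from the patch or from phyp:

#include <stdio.h>

int main(void)
{
        unsigned long long ntces = 1ULL << 21;   /* pretend phyp handed out 2M TCEs */
        unsigned long long sz_4k  = ntces << 12; /* default window, 4K IOMMU pages  */
        unsigned long long sz_16m = ntces << 24; /* DDW, 16MB IOMMU pages           */

        printf("4K pages  : %llu GiB mappable\n", sz_4k >> 30);
        printf("16MB pages: %llu GiB mappable\n", sz_16m >> 30);
        return 0;
}

Same TCE budget, 4096x the coverage with 16MB pages, which is why an
indirect DDW is still worth having when a full 1:1 map of RAM is out of
reach.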
I think it looks OK now; it was mostly me misunderstanding one part of
the previous iteration.
Reviewed-by: Frederic Barrat <fbarrat at linux.ibm.com>
Sorry for the late review; I was enjoying some time off. And thanks for
that series; I believe it should help with those bugs complaining about
a lack of DMA space. It was also very educational for me, thanks to you
and Alexey for your detailed answers.
Fred
> arch/powerpc/platforms/pseries/iommu.c | 89 +++++++++++++++++++++-----
> 1 file changed, 74 insertions(+), 15 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index e11c00b2dc1e..0eccc29f5573 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
> /* protects initializing window twice for same device */
> static DEFINE_MUTEX(direct_window_init_mutex);
> #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
>
> static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
> unsigned long num_pfn, const void *arg)
> @@ -940,6 +941,7 @@ static int find_existing_ddw_windows(void)
> return 0;
>
> find_existing_ddw_windows_named(DIRECT64_PROPNAME);
> + find_existing_ddw_windows_named(DMA64_PROPNAME);
>
> return 0;
> }
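Scanning for both property names at boot is what keeps a kexec()ed
kernel honest about what it inherited: DIRECT64_PROPNAME means the whole
partition is already TCE-mapped 1:1, while DMA64_PROPNAME only promises
a (possibly smaller) 64-bit window where every mapping still goes
through iommu_alloc(). A minimal sketch of that distinction -- the
pdn_has_ddw() helper below is mine, not something from the patch:

#include <linux/of.h>

/* Sketch only -- the real scan is find_existing_ddw_windows_named();
 * the property names mirror the #defines in iommu.c.
 */
static bool pdn_has_ddw(struct device_node *pdn, bool *direct)
{
        /* At most one of the two properties is expected per node. */
        if (of_find_property(pdn, "linux,direct64-ddr-window-info", NULL)) {
                *direct = true;   /* whole partition already TCE-mapped 1:1 */
                return true;
        }
        if (of_find_property(pdn, "linux,dma64-ddr-window-info", NULL)) {
                *direct = false;  /* 64-bit window, iommu_alloc() still required */
                return true;
        }
        return false;
}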
> @@ -1226,14 +1228,17 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> struct ddw_create_response create;
> int page_shift;
> u64 win_addr;
> + const char *win_name;
> struct device_node *dn;
> u32 ddw_avail[DDW_APPLICABLE_SIZE];
> struct direct_window *window;
> struct property *win64;
> bool ddw_enabled = false;
> struct failed_ddw_pdn *fpdn;
> - bool default_win_removed = false;
> + bool default_win_removed = false, direct_mapping = false;
> bool pmem_present;
> + struct pci_dn *pci = PCI_DN(pdn);
> + struct iommu_table *tbl = pci->table_group->tables[0];
>
> dn = of_find_node_by_type(NULL, "ibm,pmemory");
> pmem_present = dn != NULL;
> @@ -1242,6 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> mutex_lock(&direct_window_init_mutex);
>
> if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
> + direct_mapping = (len >= max_ram_len);
> ddw_enabled = true;
> goto out_unlock;
> }
> @@ -1322,8 +1328,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> query.page_size);
> goto out_failed;
> }
> - /* verify the window * number of ptes will map the partition */
> - /* check largest block * page size > max memory hotplug addr */
> +
> +
> /*
> * The "ibm,pmemory" can appear anywhere in the address space.
> * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
> @@ -1339,13 +1345,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> dev_info(&dev->dev, "Skipping ibm,pmemory");
> }
>
> + /* check if the available block * number of ptes will map everything */
> if (query.largest_available_block < (1ULL << (len - page_shift))) {
> dev_dbg(&dev->dev,
> "can't map partition max 0x%llx with %llu %llu-sized pages\n",
> 1ULL << len,
> query.largest_available_block,
> 1ULL << page_shift);
> - goto out_failed;
> +
> + /* DDW + IOMMU on single window may fail if there is any allocation */
> + if (default_win_removed && iommu_table_in_use(tbl)) {
> + dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
> + goto out_failed;
> + }
> +
> + len = order_base_2(query.largest_available_block << page_shift);
> + win_name = DMA64_PROPNAME;
> + } else {
> + direct_mapping = true;
> + win_name = DIRECT64_PROPNAME;
> }
>
> ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
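The sizing fallback above is easy to get lost in, so here is a
standalone, userspace-ish sketch of the same check, with order_base_2()
open-coded since we don't have <linux/log2.h> outside the kernel; all
the numbers are invented for illustration:

#include <stdio.h>

/* Open-coded stand-in for the kernel's order_base_2() from <linux/log2.h>. */
static unsigned int order_base_2_ull(unsigned long long n)
{
        return n > 1 ? 64 - __builtin_clzll(n - 1) : 0;
}

int main(void)
{
        unsigned long long largest_available_block = 4096; /* made-up TCE count   */
        unsigned int page_shift = 24;                       /* 16MB IOMMU pages    */
        unsigned int len = 37;                              /* want a 128GB window */

        if (largest_available_block < (1ULL << (len - page_shift))) {
                /* Not enough TCEs for 1:1 -- shrink the window and go indirect. */
                len = order_base_2_ull(largest_available_block << page_shift);
                printf("indirect mapping, window order %u (%llu GiB)\n",
                       len, 1ULL << (len - 30));
        } else {
                printf("direct mapping possible\n");
        }
        return 0;
}

The point is simply that when the largest available block of TCEs cannot
cover max RAM at the chosen page size, len shrinks to what those TCEs
can cover and the window is kept for indirect mapping instead of being
given up entirely.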
> @@ -1356,8 +1374,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> create.liobn, dn);
>
> win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
> - win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
> - page_shift, len);
> + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
> +
> if (!win64) {
> dev_info(&dev->dev,
> "couldn't allocate property, property name, or value\n");
> @@ -1375,15 +1393,54 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> if (!window)
> goto out_del_prop;
>
> - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> - win64->value, tce_setrange_multi_pSeriesLP_walk);
> - if (ret) {
> - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> - dn, ret);
> + if (direct_mapping) {
> + /* DDW maps the whole partition, so enable direct DMA mapping */
> + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> + win64->value, tce_setrange_multi_pSeriesLP_walk);
> + if (ret) {
> + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> + dn, ret);
>
> /* Make sure to clean DDW if any TCE was set */
> clean_dma_window(pdn, win64->value);
> - goto out_del_list;
> + goto out_del_list;
> + }
> + } else {
> + struct iommu_table *newtbl;
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
> + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
> +
> + /* Look for MMIO32 */
> + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
> + break;
> + }
> +
> + if (i == ARRAY_SIZE(pci->phb->mem_resources))
> + goto out_del_list;
> +
> + /* New table for using DDW instead of the default DMA window */
> + newtbl = iommu_pseries_alloc_table(pci->phb->node);
> + if (!newtbl) {
> + dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
> + goto out_del_list;
> + }
> +
> + iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
> + 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
> + iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start,
> + pci->phb->mem_resources[i].end);
> +
> + pci->table_group->tables[1] = newtbl;
> +
> + /* Keep default DMA window struct if removed */
> + if (default_win_removed) {
> + tbl->it_size = 0;
> + kfree(tbl->it_map);
> + }
> +
> + set_iommu_table_base(&dev->dev, newtbl);
> }
>
> spin_lock(&direct_window_list_lock);
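Maybe worth spelling out for driver authors reading along: none of this
is visible above the DMA API. Whether the device stays on the default
window or moves to the DDW-backed table in table_group->tables[1], a
driver keeps doing the usual thing. The demo_map_buffer() below is a
hypothetical example of mine, not code from the patch:

#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int demo_map_buffer(struct pci_dev *pdev, void *buf, size_t len)
{
        dma_addr_t handle;

        /* Still routed through the IOMMU DMA ops and iommu_alloc(); after
         * this patch the TCEs may simply come from the larger DDW window.
         */
        handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, handle))
                return -ENOMEM;

        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
        return 0;
}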
> @@ -1427,10 +1484,10 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> * as RAM, then we failed to create a window to cover persistent
> * memory and need to set the DMA limit.
> */
> - if (pmem_present && ddw_enabled && (len == max_ram_len))
> + if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len)
> dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);
>
> - return ddw_enabled;
> + return ddw_enabled && direct_mapping;
> }
>
> static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
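And to make the changed return value explicit: enable_ddw() now returns
true only when a direct mapping was actually set up, so in the indirect
case the caller keeps the device on the IOMMU DMA ops (the call site is
iommu_bypass_supported_pSeriesLP(), if I'm reading the call chain
right). The pmem clamp above is plain arithmetic; a toy example with
made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long dma_offset = 0x0800000000000000ULL; /* made-up window bus address */
        unsigned int len = 37;                                  /* window covers 128GB of RAM */

        /* Cap usable bus addresses at the end of the direct window, so DMA
         * to pmem (which would land past it) is not attempted through it.
         */
        unsigned long long bus_dma_limit = dma_offset + (1ULL << len);

        printf("bus_dma_limit = 0x%llx\n", bus_dma_limit);
        return 0;
}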
> @@ -1572,7 +1629,9 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
> * we have to remove the property when releasing
> * the device node.
> */
> - remove_ddw(np, false, DIRECT64_PROPNAME);
> + if (remove_ddw(np, false, DIRECT64_PROPNAME))
> + remove_ddw(np, false, DMA64_PROPNAME);
> +
> if (pci && pci->table_group)
> iommu_pseries_free_group(pci->table_group,
> np->full_name);
>