[PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping
Alexey Kardashevskiy
aik at ozlabs.ru
Tue Sep 29 13:56:23 AEST 2020
On 12/09/2020 03:07, Leonardo Bras wrote:
> Cc: linuxppc-dev at lists.ozlabs.org, linux-kernel at vger.kernel.org,
>
> So far it's assumed possible to map the guest RAM 1:1 to the bus, which
> works with a small number of devices. SRIOV changes it as the user can
> configure hundreds of VFs and since phyp preallocates TCEs and does not
> allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> per PE to limit waste of physical pages.
>
> As of today, if the assumed direct mapping is not possible, DDW creation
> is skipped and the default DMA window "ibm,dma-window" is used instead.
>
> The default DMA window uses 4k pages instead of 64k pages, and since
> the amount of pages (TCEs) may stay the same (on pHyp case), making
> use of DDW instead of the default DMA window for indirect mapping will
> expand by up to 16x the amount of memory that can be mapped on DMA.
>
> Indirect mapping will only be used if direct mapping is not a
> possibility.
>
> For indirect mapping, it's necessary to re-create the iommu_table with
> the new DMA window parameters, so iommu_alloc() can use it.
>
> Removing the default DMA window for using DDW with indirect mapping
> is only allowed if there is no current IOMMU memory allocated in
> the iommu_table. enable_ddw() is aborted otherwise.
>
> Even though there won't be both direct and indirect mappings at the
> same time, we can't reuse the DIRECT64_PROPNAME property name, or else
> an older kexec()ed kernel can assume direct mapping, and skip
> iommu_alloc(), causing undesirable behavior.
> So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
> was created to represent a DDW that does not allow direct mapping.
>
> Note: ddw_memory_hotplug_max() was moved up so it can be used in
> find_existing_ddw().
>
> Signed-off-by: Leonardo Bras <leobras.c at gmail.com>
> ---
> arch/powerpc/platforms/pseries/iommu.c | 160 ++++++++++++++++---------
> 1 file changed, 103 insertions(+), 57 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 9b7c03652e72..c4de23080d1b 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
> /* protects initializing window twice for same device */
> static DEFINE_MUTEX(direct_window_init_mutex);
> #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
>
> static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
> unsigned long num_pfn, const void *arg)
> @@ -846,10 +847,48 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_
> return 0;
> }
>
> -static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> +static phys_addr_t ddw_memory_hotplug_max(void)
Please either use a forward declaration or move this in a separate patch;
relocating the function here adds unnecessary noise to the actual change.
> +{
> + phys_addr_t max_addr = memory_hotplug_max();
> + struct device_node *memory;
> +
> + /*
> + * The "ibm,pmemory" can appear anywhere in the address space.
> + * Assuming it is still backed by page structs, set the upper limit
> + * for the huge DMA window as MAX_PHYSMEM_BITS.
> + */
> + if (of_find_node_by_type(NULL, "ibm,pmemory"))
> + return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
> + (phys_addr_t)-1 : (1ULL << MAX_PHYSMEM_BITS);
> +
> + for_each_node_by_type(memory, "memory") {
> + unsigned long start, size;
> + int n_mem_addr_cells, n_mem_size_cells, len;
> + const __be32 *memcell_buf;
> +
> + memcell_buf = of_get_property(memory, "reg", &len);
> + if (!memcell_buf || len <= 0)
> + continue;
> +
> + n_mem_addr_cells = of_n_addr_cells(memory);
> + n_mem_size_cells = of_n_size_cells(memory);
> +
> + start = of_read_number(memcell_buf, n_mem_addr_cells);
> + memcell_buf += n_mem_addr_cells;
> + size = of_read_number(memcell_buf, n_mem_size_cells);
> + memcell_buf += n_mem_size_cells;
> +
> + max_addr = max_t(phys_addr_t, max_addr, start + size);
> + }
> +
> + return max_addr;
> +}
> +
> +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool *direct_mapping)
> {
> struct direct_window *window;
> const struct dynamic_dma_window_prop *direct64;
> + unsigned long window_size;
> bool found = false;
>
> spin_lock(&direct_window_list_lock);
> @@ -858,6 +897,10 @@ static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> if (window->device == pdn) {
> direct64 = window->prop;
> *dma_addr = be64_to_cpu(direct64->dma_base);
> +
> + window_size = (1UL << be32_to_cpu(direct64->window_shift));
> + *direct_mapping = (window_size >= ddw_memory_hotplug_max());
> +
> found = true;
> break;
> }
> @@ -912,6 +955,7 @@ static int find_existing_ddw_windows(void)
> return 0;
>
> find_existing_ddw_windows_named(DIRECT64_PROPNAME);
> + find_existing_ddw_windows_named(DMA64_PROPNAME);
>
> return 0;
> }
> @@ -1054,43 +1098,6 @@ struct failed_ddw_pdn {
>
> static LIST_HEAD(failed_ddw_pdn_list);
>
> -static phys_addr_t ddw_memory_hotplug_max(void)
> -{
> - phys_addr_t max_addr = memory_hotplug_max();
> - struct device_node *memory;
> -
> - /*
> - * The "ibm,pmemory" can appear anywhere in the address space.
> - * Assuming it is still backed by page structs, set the upper limit
> - * for the huge DMA window as MAX_PHYSMEM_BITS.
> - */
> - if (of_find_node_by_type(NULL, "ibm,pmemory"))
> - return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
> - (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
> -
> - for_each_node_by_type(memory, "memory") {
> - unsigned long start, size;
> - int n_mem_addr_cells, n_mem_size_cells, len;
> - const __be32 *memcell_buf;
> -
> - memcell_buf = of_get_property(memory, "reg", &len);
> - if (!memcell_buf || len <= 0)
> - continue;
> -
> - n_mem_addr_cells = of_n_addr_cells(memory);
> - n_mem_size_cells = of_n_size_cells(memory);
> -
> - start = of_read_number(memcell_buf, n_mem_addr_cells);
> - memcell_buf += n_mem_addr_cells;
> - size = of_read_number(memcell_buf, n_mem_size_cells);
> - memcell_buf += n_mem_size_cells;
> -
> - max_addr = max_t(phys_addr_t, max_addr, start + size);
> - }
> -
> - return max_addr;
> -}
> -
> /*
> * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> * ibm,ddw-extensions, which carries the rtas token for
> @@ -1173,14 +1180,19 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> struct device_node *dn;
> u32 ddw_avail[DDW_APPLICABLE_SIZE];
> struct direct_window *window;
> + const char *win_name;
> struct property *win64 = NULL;
> struct failed_ddw_pdn *fpdn;
> - bool default_win_removed = false;
> + bool default_win_removed = false, direct_mapping = false;
> + struct pci_dn *pci = PCI_DN(pdn);
> + struct iommu_table *tbl = pci->table_group->tables[0];
>
> mutex_lock(&direct_window_init_mutex);
>
> - if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
> - goto out_unlock;
> + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &direct_mapping)) {
> + mutex_unlock(&direct_window_init_mutex);
> + return direct_mapping;
> + }
>
> /*
> * If we already went through this for a previous function of
> @@ -1266,15 +1278,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> }
>
> /* verify the window * number of ptes will map the partition */
> - /* check largest block * page size > max memory hotplug addr */
> max_addr = ddw_memory_hotplug_max();
> if (query.largest_available_block < (max_addr >> page_shift)) {
> - dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
> - "%llu-sized pages\n", max_addr, query.largest_available_block,
> - 1ULL << page_shift);
> + dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
> + max_addr, query.largest_available_block,
> + 1ULL << page_shift);
> +
> + len = order_base_2(query.largest_available_block << page_shift);
> + win_name = DMA64_PROPNAME;
> + } else {
> + direct_mapping = true;
> + len = order_base_2(max_addr);
> + win_name = DIRECT64_PROPNAME;
> + }
> +
> + /* DDW + IOMMU on single window may fail if there is any allocation */
> + if (default_win_removed && !direct_mapping && iommu_table_in_use(tbl)) {
> + dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
> goto out_failed;
> }
> - len = order_base_2(max_addr);
>
> ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
> if (ret != 0)
> @@ -1284,8 +1306,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> create.liobn, dn);
>
> win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
> - win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
> - page_shift, len);
> + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
> if (!win64) {
> dev_info(&dev->dev,
> "couldn't allocate property, property name, or value\n");
> @@ -1300,15 +1321,37 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> }
>
> window = ddw_list_new_entry(pdn, win64->value);
> - if (!window)
> + if (!window) {
> + dev_dbg(&dev->dev, "couldn't create new list entry\n");
> goto out_prop_del;
> + }
>
> - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> - win64->value, tce_setrange_multi_pSeriesLP_walk);
> - if (ret) {
> - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> - dn, ret);
> - goto out_list_del;
> + if (direct_mapping) {
> + /* DDW maps the whole partition, so enable direct DMA mapping */
> + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> + win64->value, tce_setrange_multi_pSeriesLP_walk);
> + if (ret) {
> + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> + dn, ret);
> + goto out_list_del;
> + }
> + } else {
> + /* New table for using DDW instead of the default DMA window */
> + tbl = iommu_pseries_alloc_table(pci->phb->node);
> + if (!tbl) {
> + dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
> + goto out_list_del;
> + }
> +
> + _iommu_table_setparms(tbl, pci->phb->bus->number, create.liobn, win_addr,
> + 1UL << len, page_shift, 0, &iommu_table_lpar_multi_ops);
> + iommu_init_table(tbl, pci->phb->node, 0, 0);
Passing 0, 0 here is only correct if win_addr > 0, which is not the case with QEMU.
> +
> + /* Free old table and replace by the newer */
> + iommu_tce_table_put(pci->table_group->tables[0]);
> + pci->table_group->tables[0] = tbl;
> +
> + set_iommu_table_base(&dev->dev, tbl);
> }
>
> spin_lock(&direct_window_list_lock);
> @@ -1345,7 +1388,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>
> out_unlock:
> mutex_unlock(&direct_window_init_mutex);
> - return win64;
> + return win64 && direct_mapping;
> }
>
> static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> @@ -1486,7 +1529,10 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
> * we have to remove the property when releasing
> * the device node.
> */
> - remove_ddw(np, false, DIRECT64_PROPNAME);
> +
> + if (remove_ddw(np, false, DIRECT64_PROPNAME))
> + remove_ddw(np, false, DMA64_PROPNAME);
> +
> if (pci && pci->table_group)
> iommu_pseries_free_group(pci->table_group,
> np->full_name);
>
--
Alexey
More information about the Linuxppc-dev
mailing list