[PATCH kernel v2] pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window

Hari Bathini hbathini at linux.ibm.com
Thu Jul 14 17:59:22 AEST 2022



On 29/06/22 11:36 am, Alexey Kardashevskiy wrote:
> The pseries platform uses 32bit default DMA window (always 4K pages) and
> optional 64bit DMA window available via DDW ("Dynamic DMA Windows"),
> 64K or 2M pages. For ages the default one was not removed and a huge
> window was created in addition. Things changed with SRIOV-enabled
> PowerVM which creates a default-and-bigger DMA window in 64bit space
> (still using 4K pages) for IOV VFs so certain OSes do not need to use
> the DDW API in order to utilize all available TCE budget.
> 
> Linux on the other hand removes the default window and creates a bigger
> one (with more TCEs or/and a bigger page size - 64K/2M) in a bid to map
> the entire RAM, and if the new window size is smaller than that - it
> still uses this new bigger window. The result is that the default window
> is removed but the "ibm,dma-window" property is not.
> 
> When kdump is invoked, the existing code tries reusing the existing 64bit
> DMA window which location and parameters are stored in the device tree but
> this fails as the new property does not make it to the kdump device tree
> blob. So the code falls back to the default window which does not exist
> anymore although the device tree says that it does. The result of that
> is that PCI devices become unusable and cannot be used for kdumping.
> 
> This preserves the DMA64 and DIRECT64 properties in the device tree blob
> for the crash kernel. Since the crash kernel setup is done after device
> drivers are loaded and probed, the proper DMA config is stored at least
> for boot time devices.
> 
> Because DDW window is optional and the code configures the default window
> first, the existing code creates an IOMMU table descriptor for
> the non-existing default DMA window. It is harmless for kdump as it does
> not touch the actual window (only reads what is mapped and marks those IO
> pages as used) but it is bad for kexec which clears it thinking it is
> a smaller default window rather than a bigger DDW window.
> 
> This removes the "ibm,dma-window" property from the device tree after
> a bigger window is created and the crash kernel setup picks it up.

> Fixes: 381ceda88c4c ("powerpc/pseries/iommu: Make use of DDW for indirect mapping")
> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>

Maybe another day  but one more reason why fdt should be prepared from
the current unflatted dt instead of initial_boot_params. ..

Anyway, the patch looks good to me..

Acked-by: Hari Bathini <hbathini at linux.ibm.com>

> ---
> 
> Looks like SYSCALL(kexec_file_load) never supported DMA. so it could be:
> Fixes: a0458284f062 ("powerpc: Add support code for kexec_file_load()")
> 
> ---
> Changes:
> v2:
> * reworked enable_ddw() to reuse default_win
> * removed @tbl as it was only used once and later on this zeroes it
> * undef for xxx64_PROPNAME in file_load_64.c
> * renamed new functions in file_load_64.c
> ---
>   arch/powerpc/kexec/file_load_64.c      | 54 ++++++++++++++++
>   arch/powerpc/platforms/pseries/iommu.c | 89 ++++++++++++++------------
>   2 files changed, 102 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index b4981b651d9a..5d2c22aa34fb 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -1038,6 +1038,48 @@ static int update_cpus_node(void *fdt)
>   	return ret;
>   }
>   
> +static int copy_property(void *fdt, int node_offset, const struct device_node *dn,
> +			 const char *propname)
> +{
> +	const void *prop, *fdtprop;
> +	int len = 0, fdtlen = 0, ret;
> +
> +	prop = of_get_property(dn, propname, &len);
> +	fdtprop = fdt_getprop(fdt, node_offset, propname, &fdtlen);
> +
> +	if (fdtprop && !prop)
> +		ret = fdt_delprop(fdt, node_offset, propname);
> +	else if (prop)
> +		ret = fdt_setprop(fdt, node_offset, propname, prop, len);
> +
> +	return ret;
> +}
> +
> +static int update_pci_dma_nodes(void *fdt, const char *dmapropname)
> +{
> +	struct device_node *dn;
> +	int pci_offset, root_offset, ret = 0;
> +
> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> +		return 0;
> +
> +	root_offset = fdt_path_offset(fdt, "/");
> +	for_each_node_with_property(dn, dmapropname) {
> +		pci_offset = fdt_subnode_offset(fdt, root_offset, of_node_full_name(dn));
> +		if (pci_offset < 0)
> +			continue;
> +
> +		ret = copy_property(fdt, pci_offset, dn, "ibm,dma-window");
> +		if (ret < 0)
> +			break;
> +		ret = copy_property(fdt, pci_offset, dn, dmapropname);
> +		if (ret < 0)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
>   /**
>    * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
>    *                       being loaded.
> @@ -1099,6 +1141,18 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>   	if (ret < 0)
>   		goto out;
>   
> +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> +	ret = update_pci_dma_nodes(fdt, DIRECT64_PROPNAME);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = update_pci_dma_nodes(fdt, DMA64_PROPNAME);
> +	if (ret < 0)
> +		goto out;
> +#undef DMA64_PROPNAME
> +#undef DIRECT64_PROPNAME
> +
>   	/* Update memory reserve map */
>   	ret = get_reserved_memory_ranges(&rmem);
>   	if (ret)
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index fba64304e859..c3d425ef7b39 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -700,6 +700,33 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
>   	.get = tce_get_pSeriesLP
>   };
>   
> +/*
> + * Find nearest ibm,dma-window (default DMA window) or direct DMA window or
> + * dynamic 64bit DMA window, walking up the device tree.
> + */
> +static struct device_node *pci_dma_find(struct device_node *dn,
> +					const __be32 **dma_window)
> +{
> +	const __be32 *dw = NULL;
> +
> +	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
> +		dw = of_get_property(dn, "ibm,dma-window", NULL);
> +		if (dw) {
> +			if (dma_window)
> +				*dma_window = dw;
> +			return dn;
> +		}
> +		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> +		if (dw)
> +			return dn;
> +		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
> +		if (dw)
> +			return dn;
> +	}
> +
> +	return NULL;
> +}
> +
>   static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>   {
>   	struct iommu_table *tbl;
> @@ -712,20 +739,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>   	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
>   		 dn);
>   
> -	/*
> -	 * Find nearest ibm,dma-window (default DMA window), walking up the
> -	 * device tree
> -	 */
> -	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
> -		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
> -		if (dma_window != NULL)
> -			break;
> -	}
> +	pdn = pci_dma_find(dn, &dma_window);
>   
> -	if (dma_window == NULL) {
> +	if (dma_window == NULL)
>   		pr_debug("  no ibm,dma-window property !\n");
> -		return;
> -	}
>   
>   	ppci = PCI_DN(pdn);
>   
> @@ -735,11 +752,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>   	if (!ppci->table_group) {
>   		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
>   		tbl = ppci->table_group->tables[0];
> -		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
> -				ppci->table_group, dma_window);
> +		if (dma_window) {
> +			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
> +						  ppci->table_group, dma_window);
>   
> -		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> -			panic("Failed to initialize iommu table");
> +			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> +				panic("Failed to initialize iommu table");
> +		}
>   		iommu_register_group(ppci->table_group,
>   				pci_domain_nr(bus), 0);
>   		pr_debug("  created table: %p\n", ppci->table_group);
> @@ -1232,7 +1251,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>   	bool default_win_removed = false, direct_mapping = false;
>   	bool pmem_present;
>   	struct pci_dn *pci = PCI_DN(pdn);
> -	struct iommu_table *tbl = pci->table_group->tables[0];
> +	struct property *default_win = NULL;
>   
>   	dn = of_find_node_by_type(NULL, "ibm,pmemory");
>   	pmem_present = dn != NULL;
> @@ -1289,11 +1308,10 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>   	 * for extensions presence.
>   	 */
>   	if (query.windows_available == 0) {
> -		struct property *default_win;
>   		int reset_win_ext;
>   
>   		/* DDW + IOMMU on single window may fail if there is any allocation */
> -		if (iommu_table_in_use(tbl)) {
> +		if (iommu_table_in_use(pci->table_group->tables[0])) {
>   			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
>   			goto out_failed;
>   		}
> @@ -1429,16 +1447,18 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>   
>   		pci->table_group->tables[1] = newtbl;
>   
> -		/* Keep default DMA window struct if removed */
> -		if (default_win_removed) {
> -			tbl->it_size = 0;
> -			vfree(tbl->it_map);
> -			tbl->it_map = NULL;
> -		}
> -
>   		set_iommu_table_base(&dev->dev, newtbl);
>   	}
>   
> +	if (default_win_removed) {
> +		iommu_tce_table_put(pci->table_group->tables[0]);
> +		pci->table_group->tables[0] = NULL;
> +
> +		/* default_win is valid here because default_win_removed == true */
> +		of_remove_property(pdn, default_win);
> +		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
> +	}
> +
>   	spin_lock(&dma_win_list_lock);
>   	list_add(&window->list, &dma_win_list);
>   	spin_unlock(&dma_win_list_lock);
> @@ -1503,13 +1523,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>   	dn = pci_device_to_OF_node(dev);
>   	pr_debug("  node is %pOF\n", dn);
>   
> -	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
> -	     pdn = pdn->parent) {
> -		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
> -		if (dma_window)
> -			break;
> -	}
> -
> +	pdn = pci_dma_find(dn, &dma_window);
>   	if (!pdn || !PCI_DN(pdn)) {
>   		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
>   		       "no DMA window found for pci dev=%s dn=%pOF\n",
> @@ -1540,7 +1554,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>   static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
>   {
>   	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
> -	const __be32 *dma_window = NULL;
>   
>   	/* only attempt to use a new window if 64-bit DMA is requested */
>   	if (dma_mask < DMA_BIT_MASK(64))
> @@ -1554,13 +1567,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
>   	 * search upwards in the tree until we either hit a dma-window
>   	 * property, OR find a parent with a table already allocated.
>   	 */
> -	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
> -			pdn = pdn->parent) {
> -		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
> -		if (dma_window)
> -			break;
> -	}
> -
> +	pdn = pci_dma_find(dn, NULL);
>   	if (pdn && PCI_DN(pdn))
>   		return enable_ddw(pdev, pdn);
>   


More information about the Linuxppc-dev mailing list