[PATCH kernel] pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window
Russell Currey
ruscur at russell.cc
Mon Jun 27 14:10:36 AEST 2022
On Thu, 2022-06-16 at 17:59 +1000, Alexey Kardashevskiy wrote:
> The pseries platform uses 32bit default DMA window (always 4K pages)
> and
> optional 64bit DMA window available via DDW ("Dynamic DMA Windows"),
> 64K or 2M pages. For ages the default one was not removed and a huge
> window was created in addition. Things changed with SRIOV-enabled
> PowerVM which creates a default-and-bigger DMA window in 64bit space
> (still using 4K pages) for IOV VFs so certain OSes do not need to use
> the DDW API in order to utilize all available TCE budget.
>
> Linux on the other hand removes the default window and creates a
> bigger
> one (with more TCEs or/and a bigger page size - 64K/2M) in a bid to
> map
> the entire RAM, and if the new window size is smaller than that - it
> still uses this new bigger window. The result is that the default
> window
> is removed but the "ibm,dma-window" property is not.
>
> When kdump is invoked, the existing code tries reusing the existing
> 64bit
> DMA window which location and parameters are stored in the device
> tree but
> this fails as the new property does not make it to the kdump device
> tree
> blob. So the code falls back to the default window which does not
> exist
> anymore although the device tree says that it does. The result of
> that
> is that PCI devices become unusable and cannot be used for kdumping.
>
> This preserves the DMA64 and DIRECT64 properties in the device tree
> blob
> for the crash kernel. Since the crash kernel setup is done after
> device
> drivers are loaded and probed, the proper DMA config is stored at
> least
> for boot time devices.
>
> Because DDW window is optional and the code configures the default
> window
> first, the existing code creates an IOMMU table descriptor for
> the non-existing default DMA window. It is harmless for kdump as it
> does
> not touch the actual window (only reads what is mapped and marks
> those IO
> pages as used) but it is bad for kexec which clears it thinking it is
> a smaller default window rather than a bigger DDW window.
>
> This removes the "ibm,dma-window" property from the device tree after
> a bigger window is created and the crash kernel setup picks it up.
>
> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
Hey Alexey, great description of the problem. Would this need a Fixes:
tag?
> ---
> arch/powerpc/kexec/file_load_64.c | 52 +++++++++++++++
> arch/powerpc/platforms/pseries/iommu.c | 88 +++++++++++++++---------
> --
> 2 files changed, 102 insertions(+), 38 deletions(-)
>
> diff --git a/arch/powerpc/kexec/file_load_64.c
> b/arch/powerpc/kexec/file_load_64.c
> index b4981b651d9a..b4b486b68b63 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -1038,6 +1038,48 @@ static int update_cpus_node(void *fdt)
> return ret;
> }
>
> +static int copy_dma_property(void *fdt, int node_offset, const
> struct device_node *dn,
> + const char *propname)
> +{
> + const void *prop, *fdtprop;
> + int len = 0, fdtlen = 0, ret;
> +
> + prop = of_get_property(dn, propname, &len);
> + fdtprop = fdt_getprop(fdt, node_offset, propname, &fdtlen);
> +
> + if (fdtprop && !prop)
> + ret = fdt_delprop(fdt, node_offset, propname);
> + else if (prop)
> + ret = fdt_setprop(fdt, node_offset, propname, prop,
> len);
If fdtprop and prop are both false, ret is returned uninitialised.
> +
> + return ret;
> +}
> +
> +static int update_pci_nodes(void *fdt, const char *dmapropname)
> +{
> + struct device_node *dn;
> + int pci_offset, root_offset, ret = 0;
> +
> + if (!firmware_has_feature(FW_FEATURE_LPAR))
> + return 0;
> +
> + root_offset = fdt_path_offset(fdt, "/");
> + for_each_node_with_property(dn, dmapropname) {
> + pci_offset = fdt_subnode_offset(fdt, root_offset,
> of_node_full_name(dn));
> + if (pci_offset < 0)
> + continue;
> +
> + ret = copy_dma_property(fdt, pci_offset, dn,
> "ibm,dma-window");
> + if (ret < 0)
> + break;
> + ret = copy_dma_property(fdt, pci_offset, dn,
> dmapropname);
> + if (ret < 0)
> + break;
> + }
> +
> + return ret;
> +}
> +
> /**
> * setup_new_fdt_ppc64 - Update the flattend device-tree of the
> kernel
> * being loaded.
> @@ -1099,6 +1141,16 @@ int setup_new_fdt_ppc64(const struct kimage
> *image, void *fdt,
> if (ret < 0)
> goto out;
>
> +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
Instead of having these defined in two different places, could they be
moved out of iommu.c and into a header? Though we hardcode ibm,dma-
window everywhere anyway.
> + ret = update_pci_nodes(fdt, DIRECT64_PROPNAME);
> + if (ret < 0)
> + goto out;
> +
> + ret = update_pci_nodes(fdt, DMA64_PROPNAME);
> + if (ret < 0)
> + goto out;
> +
> /* Update memory reserve map */
> ret = get_reserved_memory_ranges(&rmem);
> if (ret)
> diff --git a/arch/powerpc/platforms/pseries/iommu.c
> b/arch/powerpc/platforms/pseries/iommu.c
> index fba64304e859..af3c871668df 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -700,6 +700,33 @@ struct iommu_table_ops
> iommu_table_lpar_multi_ops = {
> .get = tce_get_pSeriesLP
> };
>
> +/*
> + * Find nearest ibm,dma-window (default DMA window) or direct DMA
> window or
> + * dynamic 64bit DMA window, walking up the device tree.
> + */
> +static struct device_node *pci_dma_find(struct device_node *dn,
> + const __be32 **dma_window)
> +{
> + const __be32 *dw = NULL;
> +
> + for ( ; dn && PCI_DN(dn); dn = dn->parent) {
> + dw = of_get_property(dn, "ibm,dma-window", NULL);
> + if (dw) {
> + if (dma_window)
> + *dma_window = dw;
> + return dn;
> + }
> + dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> + if (dw)
> + return dn;
> + dw = of_get_property(dn, DMA64_PROPNAME, NULL);
> + if (dw)
> + return dn;
> + }
> +
> + return NULL;
> +}
> +
> static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> {
> struct iommu_table *tbl;
> @@ -712,20 +739,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct
> pci_bus *bus)
> pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus
> %pOF\n",
> dn);
>
> - /*
> - * Find nearest ibm,dma-window (default DMA window), walking
> up the
> - * device tree
> - */
> - for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
> - dma_window = of_get_property(pdn, "ibm,dma-window",
> NULL);
> - if (dma_window != NULL)
> - break;
> - }
> + pdn = pci_dma_find(dn, &dma_window);
>
> - if (dma_window == NULL) {
> + if (dma_window == NULL)
> pr_debug(" no ibm,dma-window property !\n");
> - return;
> - }
>
> ppci = PCI_DN(pdn);
>
> @@ -735,11 +752,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct
> pci_bus *bus)
> if (!ppci->table_group) {
> ppci->table_group = iommu_pseries_alloc_group(ppci-
> >phb->node);
> tbl = ppci->table_group->tables[0];
> - iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
> - ppci->table_group, dma_window);
> + if (dma_window) {
> + iommu_table_setparms_lpar(ppci->phb, pdn,
> tbl,
> + ppci->table_group,
> dma_window);
>
> - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> - panic("Failed to initialize iommu table");
> + if (!iommu_init_table(tbl, ppci->phb->node,
> 0, 0))
> + panic("Failed to initialize iommu
> table");
> + }
> iommu_register_group(ppci->table_group,
> pci_domain_nr(bus), 0);
> pr_debug(" created table: %p\n", ppci->table_group);
> @@ -1429,16 +1448,22 @@ static bool enable_ddw(struct pci_dev *dev,
> struct device_node *pdn)
>
> pci->table_group->tables[1] = newtbl;
>
> - /* Keep default DMA window struct if removed */
> - if (default_win_removed) {
> - tbl->it_size = 0;
> - vfree(tbl->it_map);
> - tbl->it_map = NULL;
> - }
> -
> set_iommu_table_base(&dev->dev, newtbl);
> }
>
> + if (default_win_removed) {
> + struct property *def_win;
Another section of enable_ddw() already has a default_win, could that
variable be made top-level and shared?
> + struct pci_dn *pci = PCI_DN(pdn);
enable_ddw() already has the same variable declared.
Otherwise, LGTM.
Reviewed-by: Russell Currey <ruscur at russell.cc>
> +
> + iommu_tce_table_put(pci->table_group->tables[0]);
> + def_win = of_find_property(pdn, "ibm,dma-window",
> NULL);
> + if (def_win) {
> + of_remove_property(pdn, def_win);
> + dev_info(&dev->dev, "Removed default DMA
> window for %pOF\n", pdn);
> + }
> + pci->table_group->tables[0] = NULL;
> + }
> +
> spin_lock(&dma_win_list_lock);
> list_add(&window->list, &dma_win_list);
> spin_unlock(&dma_win_list_lock);
> @@ -1503,13 +1528,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct
> pci_dev *dev)
> dn = pci_device_to_OF_node(dev);
> pr_debug(" node is %pOF\n", dn);
>
> - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)-
> >table_group;
> - pdn = pdn->parent) {
> - dma_window = of_get_property(pdn, "ibm,dma-window",
> NULL);
> - if (dma_window)
> - break;
> - }
> -
> + pdn = pci_dma_find(dn, &dma_window);
> if (!pdn || !PCI_DN(pdn)) {
> printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
> "no DMA window found for pci dev=%s
> dn=%pOF\n",
> @@ -1540,7 +1559,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct
> pci_dev *dev)
> static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev,
> u64 dma_mask)
> {
> struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
> - const __be32 *dma_window = NULL;
>
> /* only attempt to use a new window if 64-bit DMA is
> requested */
> if (dma_mask < DMA_BIT_MASK(64))
> @@ -1554,13 +1572,7 @@ static bool
> iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
> * search upwards in the tree until we either hit a dma-
> window
> * property, OR find a parent with a table already allocated.
> */
> - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)-
> >table_group;
> - pdn = pdn->parent) {
> - dma_window = of_get_property(pdn, "ibm,dma-window",
> NULL);
> - if (dma_window)
> - break;
> - }
> -
> + pdn = pci_dma_find(dn, NULL);
> if (pdn && PCI_DN(pdn))
> return enable_ddw(pdev, pdn);
>
More information about the Linuxppc-dev
mailing list