[RFC PATCH] powerpc: Make crashing cpu to be discovered first in kdump kernel.

Pingfan Liu kernelfans at gmail.com
Mon Sep 11 23:23:42 AEST 2023


Hi Mahesh,

I am not quite sure about fdt, so I skip that part, and leave some
comments from the kexec view.

On Thu, Sep 7, 2023 at 1:59 AM Mahesh Salgaonkar <mahesh at linux.ibm.com> wrote:
>
> The kernel boot parameter 'nr_cpus=' allows one to specify number of
> possible cpus in the system. In the normal scenario the first cpu (cpu0)
> that shows up is the boot cpu and hence it gets covered under nr_cpus
> limit.
>
> But this assumption is broken in kdump scenario where kdump kernel after a
> crash can boot up on an non-zero boot cpu. The paca structure allocation
> depends on value of nr_cpus and is indexed using logical cpu ids. The cpu
> discovery code brings up the cpus as they appear sequentially on device
> tree and assigns logical cpu ids starting from 0. This definitely becomes
> an issue if boot cpu id > nr_cpus. When this occurs it results into
>
> In past there were proposals to fix this by making changes to cpu discovery
> code to identify non-zero boot cpu and map it to logical cpu 0. However,
> the changes were very invasive, making discovery code more complicated and
> risky.
>
> Considering that the non-zero boot cpu scenario is more specific to kdump
> kernel, limiting the changes in panic/crash kexec path would probably be a
> best approach to have.
>
> Hence proposed change is, in crash kexec path, move the crashing cpu's
> device node to the first position under '/cpus' node, which will make the
> crashing cpu to be discovered as part of the first core in kdump kernel.
>
> In order to accommodate boot cpu for the case where boot_cpuid > nr_cpu_ids,
> align up the nr_cpu_ids to SMT threads in early_init_dt_scan_cpus(). This
> will allow kdump kernel to work with nr_cpus=X where X will be aligned up
> in multiple of SMT threads per core.
>
> Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
> ---
>  arch/powerpc/include/asm/kexec.h  |    1
>  arch/powerpc/kernel/prom.c        |   13 ++++
>  arch/powerpc/kexec/core_64.c      |  128 +++++++++++++++++++++++++++++++++++++
>  arch/powerpc/kexec/file_load_64.c |    2 -
>  4 files changed, 143 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
> index a1ddba01e7d13..f5a6f4a1b8eb0 100644
> --- a/arch/powerpc/include/asm/kexec.h
> +++ b/arch/powerpc/include/asm/kexec.h
> @@ -144,6 +144,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image);
>  int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>                         unsigned long initrd_load_addr,
>                         unsigned long initrd_len, const char *cmdline);
> +int add_node_props(void *fdt, int node_offset, const struct device_node *dn);
>  #endif /* CONFIG_PPC64 */
>
>  #endif /* CONFIG_KEXEC_FILE */
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 0b5878c3125b1..c2d4f55042d72 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -322,6 +322,9 @@ static void __init check_cpu_feature_properties(unsigned long node)
>         }
>  }
>
> +/* align addr on a size boundary - adjust address up */
> +#define _ALIGN_UP(addr, size)   (((addr)+((size)-1))&(~((typeof(addr))(size)-1)))
> +
>  static int __init early_init_dt_scan_cpus(unsigned long node,
>                                           const char *uname, int depth,
>                                           void *data)
> @@ -348,6 +351,16 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
>
>         nthreads = len / sizeof(int);
>
> +       /*
> +        * Align nr_cpu_ids to correct SMT value. This will help us to allocate
> +        * pacas correctly to accomodate boot_cpu != 0 scenario e.g. in kdump
> +        * kernel the boot cpu can be any cpu between 0 through nthreads.
> +        */
> +       if (nr_cpu_ids % nthreads) {
> +               nr_cpu_ids = _ALIGN_UP(nr_cpu_ids, nthreads);

It is better to use set_nr_cpu_ids(), which can hide the difference of
nr_cpus_ids under different kernel configuration.

> +               pr_info("Aligned nr_cpus to SMT=%d, nr_cpu_ids = %d\n", nthreads, nr_cpu_ids);
> +       }
> +
>         /*
>          * Now see if any of these threads match our boot cpu.
>          * NOTE: This must match the parsing done in smp_setup_cpu_maps.
> diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
> index a79e28c91e2be..168bef43e22c2 100644
> --- a/arch/powerpc/kexec/core_64.c
> +++ b/arch/powerpc/kexec/core_64.c
> @@ -17,6 +17,7 @@
>  #include <linux/cpu.h>
>  #include <linux/hardirq.h>
>  #include <linux/of.h>
> +#include <linux/libfdt.h>
>
>  #include <asm/page.h>
>  #include <asm/current.h>
> @@ -298,6 +299,119 @@ extern void kexec_sequence(void *newstack, unsigned long start,
>                            void (*clear_all)(void),
>                            bool copy_with_mmu_off) __noreturn;
>
> +/*
> + * Move the crashing cpus FDT node as the first node under '/cpus' node.
> + *
> + * - Get the FDT segment from the crash image segments.
> + * - Locate the crashing CPUs fdt subnode 'A' under '/cpus' node.
> + * - Now locate the crashing cpu device node 'B' from of_root device tree.
> + * - Delete the crashing cpu FDT node 'A' from kexec FDT segment.
> + * - Insert the crashing cpu device node 'B' into kexec FDT segment as first
> + *   subnode under '/cpus' node.
> + */
> +static void move_crashing_cpu(struct kimage *image)
> +{
> +       void *fdt, *ptr;
> +       const char *pathp = NULL;
> +       unsigned long mem;
> +       const __be32 *intserv;
> +       struct device_node *dn;
> +       bool first_node = true;
> +       int cpus_offset, offset, fdt_index = -1;
> +       int initial_depth, depth, len, i, ret, nthreads;
> +
> +       /* Find the FDT segment index in kexec segment array. */
> +       for (i = 0; i < image->nr_segments; i++) {
> +               mem = image->segment[i].mem;
> +               ptr = __va(mem);
> +               if (ptr && fdt_magic(ptr) == FDT_MAGIC) {
> +                       fdt_index = i;
> +                       break;
> +               }
> +       }
> +       if (fdt_index < 0) {
> +               pr_err("Unable to locate FDT segment.\n");
> +               return;
> +       }
> +
> +       fdt = __va((void *)image->segment[fdt_index].mem);
> +
> +       offset = cpus_offset = fdt_path_offset(fdt, "/cpus");
> +       if (cpus_offset < 0) {
> +               if (cpus_offset != -FDT_ERR_NOTFOUND)
> +                       pr_err("Malformed device tree: error reading /cpus node: %s\n",
> +                               fdt_strerror(cpus_offset));
> +               return;
> +       }
> +
> +       /* Locate crashing cpus fdt node */
> +        initial_depth = depth = 0;
> +       for (offset = fdt_next_node(fdt, offset, &depth);
> +               offset >= 0 && depth > initial_depth;
> +               offset = fdt_next_node(fdt, offset, &depth)) {
> +
> +
> +               intserv = fdt_getprop(fdt, offset, "ibm,ppc-interrupt-server#s", &len);
> +               if (!intserv) {
> +                       pr_err("Unable to fetch ibm,ppc-interrupt-server#s property\n");
> +                       return;
> +               }
> +
> +               /* Find the match for crashing cpus phys id. */
> +               nthreads = len / sizeof(int);
> +               for (i = 0; i < nthreads; i++) {
> +                       if (be32_to_cpu(intserv[i]) == get_paca()->hw_cpu_id)
> +                               break;
> +               }
> +               if (i < nthreads) {
> +                       /* Found the match */
> +                       pathp = fdt_get_name(fdt, offset, NULL);
> +                       break;
> +               }
> +
> +               first_node = false;
> +       }
> +
> +       /*
> +        * Nothing to be done if crashing cpu's fdt node is already at first
> +        * position OR crashing cpu's fdt node isn't present in kexec FDT
> +        * segment, which is not possible unless kexec FDT segment hasn't been
> +        * refreshed after DLPAR.
> +        */
> +       if (first_node || offset < 0)
> +               return;
> +
> +       /* Locate the device node of crashing cpu from of_root */
> +       for_each_node_by_type(dn, "cpu") {
> +               if (!strcmp(dn->full_name, pathp))
> +                       break;
> +       }
> +       if (!dn) {
> +               pr_err("Could not locate device node of crashing cpu: %s\n", pathp);
> +               return;
> +       }
> +
> +       /* Delete the crashing cpu FDT node from kexec FDT segment */
> +       ret = fdt_del_node(fdt, offset);
> +       if (ret < 0) {
> +               pr_err("Error deleting node /cpus/%s: %s\n", pathp, fdt_strerror(ret));
> +               return;
> +       }
> +
> +       /* Add it as first subnode under /cpus node. */
> +       offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name);
> +       if (offset < 0) {
> +               pr_err("Unable to add %s subnode: %s\n", dn->full_name,
> +                       fdt_strerror(offset));
> +               return;
> +       }
> +
> +       /* Copy rest of the properties of crashing cpus */
> +       add_node_props(fdt, offset, dn);
> +
> +       return;
> +}
> +
>  /* too late to fail here */
>  void default_machine_kexec(struct kimage *image)
>  {
> @@ -341,6 +455,20 @@ void default_machine_kexec(struct kimage *image)
>                 printk("kexec: Unshared all shared pages.\n");
>         }
>
> +       /*
> +        * Move the crashing cpus FDT node as the first node under /cpus node.
> +        * This will make the core (where crashing cpu belongs) to
> +        * automatically become first core to show up in kdump kernel and
> +        * crashing cpu as boot cpu within first n threads of that core.
> +        *
> +        * Currently this will work with kexec_file_load only.
> +        *
> +        * XXX: For kexec_load, change is required in kexec tool to exclude FDT
> +        * segment from purgatory checksum check.
> +        */
> +       if (image->type == KEXEC_TYPE_CRASH && image->file_mode)
> +               move_crashing_cpu(image);
> +

Could "kexec -e" have the same logic? Then nr_cpus can work for both
"kexec -p" and "kexec -e".

Thanks,

Pingfan

>         paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
>
>         setup_paca(&kexec_paca);
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index 110d28bede2a7..42d55a19454a7 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -1027,7 +1027,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image)
>   *
>   * Returns 0 on success, negative errno on error.
>   */
> -static int add_node_props(void *fdt, int node_offset, const struct device_node *dn)
> +int add_node_props(void *fdt, int node_offset, const struct device_node *dn)
>  {
>         int ret = 0;
>         struct property *pp;
>
>


More information about the Linuxppc-dev mailing list