Hi Mahesh, Thanks for sharing your great idea. I was in the middle of V5 and finish it today.
My v5 is based on the same idea of my v4 [1] with the improvement of the code. And I will send it out. [1]: https://lore.kernel.org/linuxppc-dev/1520829790-14029-1-git-send-email-kernelf...@gmail.com/ I will have a close look at your patch later. Thanks, Pingfan On Thu, Sep 7, 2023 at 1:59 AM Mahesh Salgaonkar <mah...@linux.ibm.com> wrote: > > The kernel boot parameter 'nr_cpus=' allows one to specify number of > possible cpus in the system. In the normal scenario the first cpu (cpu0) > that shows up is the boot cpu and hence it gets covered under nr_cpus > limit. > > But this assumption is broken in kdump scenario where kdump kernel after a > crash can boot up on an non-zero boot cpu. The paca structure allocation > depends on value of nr_cpus and is indexed using logical cpu ids. The cpu > discovery code brings up the cpus as they appear sequentially on device > tree and assigns logical cpu ids starting from 0. This definitely becomes > an issue if boot cpu id > nr_cpus. When this occurs it results into > > In past there were proposals to fix this by making changes to cpu discovery > code to identify non-zero boot cpu and map it to logical cpu 0. However, > the changes were very invasive, making discovery code more complicated and > risky. > > Considering that the non-zero boot cpu scenario is more specific to kdump > kernel, limiting the changes in panic/crash kexec path would probably be a > best approach to have. > > Hence proposed change is, in crash kexec path, move the crashing cpu's > device node to the first position under '/cpus' node, which will make the > crashing cpu to be discovered as part of the first core in kdump kernel. > > In order to accommodate boot cpu for the case where boot_cpuid > nr_cpu_ids, > align up the nr_cpu_ids to SMT threads in early_init_dt_scan_cpus(). This > will allow kdump kernel to work with nr_cpus=X where X will be aligned up > in multiple of SMT threads per core. > > Signed-off-by: Mahesh Salgaonkar <mah...@linux.ibm.com> > --- > arch/powerpc/include/asm/kexec.h | 1 > arch/powerpc/kernel/prom.c | 13 ++++ > arch/powerpc/kexec/core_64.c | 128 > +++++++++++++++++++++++++++++++++++++ > arch/powerpc/kexec/file_load_64.c | 2 - > 4 files changed, 143 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/include/asm/kexec.h > b/arch/powerpc/include/asm/kexec.h > index a1ddba01e7d13..f5a6f4a1b8eb0 100644 > --- a/arch/powerpc/include/asm/kexec.h > +++ b/arch/powerpc/include/asm/kexec.h > @@ -144,6 +144,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage > *image); > int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, > unsigned long initrd_load_addr, > unsigned long initrd_len, const char *cmdline); > +int add_node_props(void *fdt, int node_offset, const struct device_node *dn); > #endif /* CONFIG_PPC64 */ > > #endif /* CONFIG_KEXEC_FILE */ > diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c > index 0b5878c3125b1..c2d4f55042d72 100644 > --- a/arch/powerpc/kernel/prom.c > +++ b/arch/powerpc/kernel/prom.c > @@ -322,6 +322,9 @@ static void __init check_cpu_feature_properties(unsigned > long node) > } > } > > +/* align addr on a size boundary - adjust address up */ > +#define _ALIGN_UP(addr, size) > (((addr)+((size)-1))&(~((typeof(addr))(size)-1))) > + > static int __init early_init_dt_scan_cpus(unsigned long node, > const char *uname, int depth, > void *data) > @@ -348,6 +351,16 @@ static int __init early_init_dt_scan_cpus(unsigned long > node, > > nthreads = len / sizeof(int); > > + /* > + * Align nr_cpu_ids to correct SMT value. This will help us to > allocate > + * pacas correctly to accomodate boot_cpu != 0 scenario e.g. in kdump > + * kernel the boot cpu can be any cpu between 0 through nthreads. > + */ > + if (nr_cpu_ids % nthreads) { > + nr_cpu_ids = _ALIGN_UP(nr_cpu_ids, nthreads); > + pr_info("Aligned nr_cpus to SMT=%d, nr_cpu_ids = %d\n", > nthreads, nr_cpu_ids); > + } > + > /* > * Now see if any of these threads match our boot cpu. > * NOTE: This must match the parsing done in smp_setup_cpu_maps. > diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c > index a79e28c91e2be..168bef43e22c2 100644 > --- a/arch/powerpc/kexec/core_64.c > +++ b/arch/powerpc/kexec/core_64.c > @@ -17,6 +17,7 @@ > #include <linux/cpu.h> > #include <linux/hardirq.h> > #include <linux/of.h> > +#include <linux/libfdt.h> > > #include <asm/page.h> > #include <asm/current.h> > @@ -298,6 +299,119 @@ extern void kexec_sequence(void *newstack, unsigned > long start, > void (*clear_all)(void), > bool copy_with_mmu_off) __noreturn; > > +/* > + * Move the crashing cpus FDT node as the first node under '/cpus' node. > + * > + * - Get the FDT segment from the crash image segments. > + * - Locate the crashing CPUs fdt subnode 'A' under '/cpus' node. > + * - Now locate the crashing cpu device node 'B' from of_root device tree. > + * - Delete the crashing cpu FDT node 'A' from kexec FDT segment. > + * - Insert the crashing cpu device node 'B' into kexec FDT segment as first > + * subnode under '/cpus' node. > + */ > +static void move_crashing_cpu(struct kimage *image) > +{ > + void *fdt, *ptr; > + const char *pathp = NULL; > + unsigned long mem; > + const __be32 *intserv; > + struct device_node *dn; > + bool first_node = true; > + int cpus_offset, offset, fdt_index = -1; > + int initial_depth, depth, len, i, ret, nthreads; > + > + /* Find the FDT segment index in kexec segment array. */ > + for (i = 0; i < image->nr_segments; i++) { > + mem = image->segment[i].mem; > + ptr = __va(mem); > + if (ptr && fdt_magic(ptr) == FDT_MAGIC) { > + fdt_index = i; > + break; > + } > + } > + if (fdt_index < 0) { > + pr_err("Unable to locate FDT segment.\n"); > + return; > + } > + > + fdt = __va((void *)image->segment[fdt_index].mem); > + > + offset = cpus_offset = fdt_path_offset(fdt, "/cpus"); > + if (cpus_offset < 0) { > + if (cpus_offset != -FDT_ERR_NOTFOUND) > + pr_err("Malformed device tree: error reading /cpus > node: %s\n", > + fdt_strerror(cpus_offset)); > + return; > + } > + > + /* Locate crashing cpus fdt node */ > + initial_depth = depth = 0; > + for (offset = fdt_next_node(fdt, offset, &depth); > + offset >= 0 && depth > initial_depth; > + offset = fdt_next_node(fdt, offset, &depth)) { > + > + > + intserv = fdt_getprop(fdt, offset, > "ibm,ppc-interrupt-server#s", &len); > + if (!intserv) { > + pr_err("Unable to fetch ibm,ppc-interrupt-server#s > property\n"); > + return; > + } > + > + /* Find the match for crashing cpus phys id. */ > + nthreads = len / sizeof(int); > + for (i = 0; i < nthreads; i++) { > + if (be32_to_cpu(intserv[i]) == get_paca()->hw_cpu_id) > + break; > + } > + if (i < nthreads) { > + /* Found the match */ > + pathp = fdt_get_name(fdt, offset, NULL); > + break; > + } > + > + first_node = false; > + } > + > + /* > + * Nothing to be done if crashing cpu's fdt node is already at first > + * position OR crashing cpu's fdt node isn't present in kexec FDT > + * segment, which is not possible unless kexec FDT segment hasn't been > + * refreshed after DLPAR. > + */ > + if (first_node || offset < 0) > + return; > + > + /* Locate the device node of crashing cpu from of_root */ > + for_each_node_by_type(dn, "cpu") { > + if (!strcmp(dn->full_name, pathp)) > + break; > + } > + if (!dn) { > + pr_err("Could not locate device node of crashing cpu: %s\n", > pathp); > + return; > + } > + > + /* Delete the crashing cpu FDT node from kexec FDT segment */ > + ret = fdt_del_node(fdt, offset); > + if (ret < 0) { > + pr_err("Error deleting node /cpus/%s: %s\n", pathp, > fdt_strerror(ret)); > + return; > + } > + > + /* Add it as first subnode under /cpus node. */ > + offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name); > + if (offset < 0) { > + pr_err("Unable to add %s subnode: %s\n", dn->full_name, > + fdt_strerror(offset)); > + return; > + } > + > + /* Copy rest of the properties of crashing cpus */ > + add_node_props(fdt, offset, dn); > + > + return; > +} > + > /* too late to fail here */ > void default_machine_kexec(struct kimage *image) > { > @@ -341,6 +455,20 @@ void default_machine_kexec(struct kimage *image) > printk("kexec: Unshared all shared pages.\n"); > } > > + /* > + * Move the crashing cpus FDT node as the first node under /cpus node. > + * This will make the core (where crashing cpu belongs) to > + * automatically become first core to show up in kdump kernel and > + * crashing cpu as boot cpu within first n threads of that core. > + * > + * Currently this will work with kexec_file_load only. > + * > + * XXX: For kexec_load, change is required in kexec tool to exclude > FDT > + * segment from purgatory checksum check. > + */ > + if (image->type == KEXEC_TYPE_CRASH && image->file_mode) > + move_crashing_cpu(image); > + > paca_ptrs[kexec_paca.paca_index] = &kexec_paca; > > setup_paca(&kexec_paca); > diff --git a/arch/powerpc/kexec/file_load_64.c > b/arch/powerpc/kexec/file_load_64.c > index 110d28bede2a7..42d55a19454a7 100644 > --- a/arch/powerpc/kexec/file_load_64.c > +++ b/arch/powerpc/kexec/file_load_64.c > @@ -1027,7 +1027,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage > *image) > * > * Returns 0 on success, negative errno on error. > */ > -static int add_node_props(void *fdt, int node_offset, const struct > device_node *dn) > +int add_node_props(void *fdt, int node_offset, const struct device_node *dn) > { > int ret = 0; > struct property *pp; > >