crash, the second/fadump boots and access the fadump header prepared by
first kernel and do the following in a platform-specific function
[rtas|opal]_fadump_process:
At present, the first kernel is responsible for preparing the fadump
header and storing it in the fadump reserved area. The fadump header
includes the start address of the elfcorehd, crashing CPU details, and
other relevant information. In the event of a crash in the first kernel,
the second/fadump boots and accesses the fadump header prepared by the
first kernel. It then performs the following steps in a
platform-specific function [rtas|opal]_fadump_process:
1. Sanity check for fadump header
2. Update CPU notes in elfcorehdr
3. Set the global variable elfcorehdr_addr to the address of the
fadump header's elfcorehdr. For vmcore module to process it later on.
Along with the above, update the setup_fadump()/fadump.c to create
elfcorehdr in second/fadump kernel.
Section below outlines the information required to create the elfcorehdr
and the changes made to make it available to the fadump kernel if it's
not already.
To create elfcorehdr, the following crashed kernel information is
required: CPU notes, vmcoreinfo, and memory ranges.
At present, the CPU notes are already prepared in the fadump kernel, so
no changes are needed in that regard. The fadump kernel has access to
all crashed kernel memory regions, including boot memory regions that
are relocated by firmware to fadump reserved areas, so no changes for
that either. However, it is necessary to add new members to the fadump
header, i.e., the 'fadump_crash_info_header' structure, in order to pass
the crashed kernel's vmcoreinfo address and its size to fadump kernel.
In addition to the vmcoreinfo address and size, there are a few other
attributes also added to the fadump_crash_info_header structure.
1. version:
It stores the fadump header version, which is currently set to 1.
This provides flexibility to update the fadump crash info header in
the future without changing the magic number. For each change in the
fadump header, the version will be increased. This will help the
updated kernel determine how to handle kernel dumps from older
kernels. The magic number remains relevant for checking fadump header
corruption.
2. elfcorehdr_size:
since elfcorehdr is now prepared in the fadump/second kernel and
it is not part of the reserved area, this attribute is needed to
track the memory allocated for elfcorehdr to do the deallocation
properly.
3. pt_regs_sz/cpu_mask_sz:
Store size of pt_regs and cpu_mask strucutre in first kernel. These
attributes are used avoid processing the dump if the sizes of pt_regs
and cpu_mask are not the same across the crashed and fadump kernel.
Note: if either first/crashed kernel or second/fadump kernel do not have
the changes introduced here then kernel fail to collect the dump and
prints relevant error message on the console.
Signed-off-by: Sourabh Jain <sourabhj...@linux.ibm.com>
Cc: Aditya Gupta <adit...@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.ku...@kernel.org>
Cc: Hari Bathini <hbath...@linux.ibm.com>
Cc: Mahesh Salgaonkar <mah...@linux.ibm.com>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Naveen N Rao <nav...@kernel.org>
---
arch/powerpc/include/asm/fadump-internal.h | 31 +-
arch/powerpc/kernel/fadump.c | 355 +++++++++++--------
arch/powerpc/platforms/powernv/opal-fadump.c | 18 +-
arch/powerpc/platforms/pseries/rtas-fadump.c | 23 +-
4 files changed, 242 insertions(+), 185 deletions(-)
diff --git a/arch/powerpc/include/asm/fadump-internal.h
b/arch/powerpc/include/asm/fadump-internal.h
index 27f9e11eda28..a632e9708610 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -42,13 +42,40 @@ static inline u64 fadump_str_to_u64(const char *str)
#define FADUMP_CPU_UNKNOWN (~((u32)0))
-#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPINF")
+/*
+ * The introduction of new fields in the fadump crash info header has
+ * led to a change in the magic key from `FADMPINF` to `FADMPSIG` for
+ * identifying a kernel crash from an old kernel.
+ *
+ * To prevent the need for further changes to the magic number in the
+ * event of future modifications to the fadump crash info header, a
+ * version field has been introduced to track the fadump crash info
+ * header version.
+ *
+ * Consider a few points before adding new members to the fadump crash info
+ * header structure:
+ *
+ * - Append new members; avoid adding them in between.
+ * - Non-primitive members should have a size member as well.
+ * - For every change in the fadump header, increment the
+ * fadump header version. This helps the updated kernel decide how to
+ * handle kernel dumps from older kernels.
+ */
+#define FADUMP_CRASH_INFO_MAGIC_OLD fadump_str_to_u64("FADMPINF")
+#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPSIG")
+#define FADUMP_HEADER_VERSION 1
/* fadump crash info structure */
struct fadump_crash_info_header {
u64 magic_number;
- u64 elfcorehdr_addr;
+ u32 version;
u32 crashing_cpu;
+ u64 vmcoreinfo_raddr;
+ u64 vmcoreinfo_size;
+ u32 pt_regs_sz;
+ u32 cpu_mask_sz;
struct pt_regs regs;
struct cpumask cpu_mask;
};
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e8589..eb9132538268 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -53,8 +53,6 @@ static struct kobject *fadump_kobj;
static atomic_t cpus_in_fadump;
static DEFINE_MUTEX(fadump_mutex);
-static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false };
-
#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */
#define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \
sizeof(struct fadump_memory_range))
@@ -373,12 +371,6 @@ static unsigned long __init get_fadump_area_size(void)
size = PAGE_ALIGN(size);
size += fw_dump.boot_memory_size;
size += sizeof(struct fadump_crash_info_header);
- size += sizeof(struct elfhdr); /* ELF core header.*/
- size += sizeof(struct elf_phdr); /* place holder for cpu notes */
- /* Program headers for crash memory regions. */
- size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
-
- size = PAGE_ALIGN(size);
/* This is to hold kernel metadata on platforms that support it */
size += (fw_dump.ops->fadump_get_metadata_size ?
@@ -931,36 +923,6 @@ static inline int fadump_add_mem_range(struct
fadump_mrange_info *mrange_info,
return 0;
}
-static int fadump_exclude_reserved_area(u64 start, u64 end)
-{
- u64 ra_start, ra_end;
- int ret = 0;
-
- ra_start = fw_dump.reserve_dump_area_start;
- ra_end = ra_start + fw_dump.reserve_dump_area_size;
-
- if ((ra_start < end) && (ra_end > start)) {
- if ((start < ra_start) && (end > ra_end)) {
- ret = fadump_add_mem_range(&crash_mrange_info,
- start, ra_start);
- if (ret)
- return ret;
-
- ret = fadump_add_mem_range(&crash_mrange_info,
- ra_end, end);
- } else if (start < ra_start) {
- ret = fadump_add_mem_range(&crash_mrange_info,
- start, ra_start);
- } else if (ra_end < end) {
- ret = fadump_add_mem_range(&crash_mrange_info,
- ra_end, end);
- }
- } else
- ret = fadump_add_mem_range(&crash_mrange_info, start, end);
-
- return ret;
-}
-
static int fadump_init_elfcore_header(char *bufp)
{
struct elfhdr *elf;
@@ -997,52 +959,6 @@ static int fadump_init_elfcore_header(char *bufp)
return 0;
}
-/*
- * Traverse through memblock structure and setup crash memory ranges. These
- * ranges will be used create PT_LOAD program headers in elfcore header.
- */
-static int fadump_setup_crash_memory_ranges(void)
-{
- u64 i, start, end;
- int ret;
-
- pr_debug("Setup crash memory ranges.\n");
- crash_mrange_info.mem_range_cnt = 0;
-
- /*
- * Boot memory region(s) registered with firmware are moved to
- * different location at the time of crash. Create separate program
- * header(s) for this memory chunk(s) with the correct offset.
- */
- for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
- start = fw_dump.boot_mem_addr[i];
- end = start + fw_dump.boot_mem_sz[i];
- ret = fadump_add_mem_range(&crash_mrange_info, start, end);
- if (ret)
- return ret;
- }
-
- for_each_mem_range(i, &start, &end) {
- /*
- * skip the memory chunk that is already added
- * (0 through boot_memory_top).
- */
- if (start < fw_dump.boot_mem_top) {
- if (end > fw_dump.boot_mem_top)
- start = fw_dump.boot_mem_top;
- else
- continue;
- }
-
- /* add this range excluding the reserved dump area. */
- ret = fadump_exclude_reserved_area(start, end);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
/*
* If the given physical address falls within the boot memory region then
* return the relocated address that points to the dump region reserved
@@ -1073,13 +989,28 @@ static inline unsigned long fadump_relocate(unsigned
long paddr)
return raddr;
}
-static int fadump_create_elfcore_headers(char *bufp)
+static void populate_elf_pt_load(struct elf_phdr *phdr, u64 start,
+ u64 size, unsigned long long offset)
{
- unsigned long long raddr, offset;
- struct elf_phdr *phdr;
+ phdr->p_align = 0;
+ phdr->p_memsz = size;
+ phdr->p_filesz = size;
+ phdr->p_paddr = start;
+ phdr->p_offset = offset;
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long)__va(start);
+}
+
+static void __init fadump_populate_elfcorehdr(struct fadump_crash_info_header
*fdh)
+{
+ char *bufp;
struct elfhdr *elf;
- int i, j;
+ struct elf_phdr *phdr;
+ u64 boot_mem_dest_offset;
+ unsigned long long i, ra_start, ra_end, ra_size, mstart, mend;
+ bufp = (char *) __va(fdh->elfcorehdr_addr);
fadump_init_elfcore_header(bufp);
elf = (struct elfhdr *)bufp;
bufp += sizeof(struct elfhdr);
@@ -1113,54 +1044,68 @@ static int fadump_create_elfcore_headers(char *bufp)
phdr->p_vaddr = 0;
phdr->p_align = 0;
- phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note());
- phdr->p_offset = phdr->p_paddr;
- phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
+ phdr->p_paddr = phdr->p_offset = fdh->vmcoreinfo_raddr;
+ phdr->p_memsz = phdr->p_filesz = fdh->vmcoreinfo_size;
/* Increment number of program headers. */
(elf->e_phnum)++;
- /* setup PT_LOAD sections. */
- j = 0;
- offset = 0;
- raddr = fw_dump.boot_mem_addr[0];
- for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) {
- u64 mbase, msize;
-
- mbase = crash_mrange_info.mem_ranges[i].base;
- msize = crash_mrange_info.mem_ranges[i].size;
- if (!msize)
- continue;
-
+ /*
+ * setup PT_LOAD sections.
+ * first include boot memory regions and then add rest of the memory
+ * regions
+ */
+ boot_mem_dest_offset = fw_dump.boot_mem_dest_addr;
+ for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
phdr = (struct elf_phdr *)bufp;
bufp += sizeof(struct elf_phdr);
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mbase;
-
- if (mbase == raddr) {
- /*
- * The entire real memory region will be moved by
- * firmware to the specified destination_address.
- * Hence set the correct offset.
- */
- phdr->p_offset = fw_dump.boot_mem_dest_addr + offset;
- if (j < (fw_dump.boot_mem_regs_cnt - 1)) {
- offset += fw_dump.boot_mem_sz[j];
- raddr = fw_dump.boot_mem_addr[++j];
- }
+ populate_elf_pt_load(phdr, fw_dump.boot_mem_addr[i],
+ fw_dump.boot_mem_sz[i],
+ boot_mem_dest_offset);
+ /* Increment number of program headers. */
+ (elf->e_phnum)++;
+ boot_mem_dest_offset += fw_dump.boot_mem_sz[i];
+ }
+
+ /* Memory reserved for fadump in first kernel */
+ ra_start = fw_dump.reserve_dump_area_start;
+ ra_size = get_fadump_area_size();
+ ra_end = ra_start + ra_size;
+
+ phdr = (struct elf_phdr *)bufp;
+ for_each_mem_range(i, &mstart, &mend) {
+ /*
+ * Skip the memory regions that already added
+ * to elfcorehdr.
+ */
+ if (mstart < fw_dump.boot_mem_top) {
+ if (mend > fw_dump.boot_mem_top)
+ mstart = fw_dump.boot_mem_top;
+ else
+ continue;
}
- phdr->p_paddr = mbase;
- phdr->p_vaddr = (unsigned long)__va(mbase);
- phdr->p_filesz = msize;
- phdr->p_memsz = msize;
- phdr->p_align = 0;
+ /* Handle memblock regions overlaps with reserved area */
+ if ((ra_start < mend) && (ra_end > mstart)) {
+ if ((mstart < ra_start) && (mend > ra_end)) {
+ populate_elf_pt_load(phdr, mstart, ra_start -
mstart, mstart);
+ elf->e_phnum++;
+ bufp += sizeof(struct elf_phdr);
+ phdr = (struct elf_phdr *)bufp;
+ populate_elf_pt_load(phdr, ra_end, mend -
ra_end, ra_end);
+ } else if (mstart < ra_start) {
+ populate_elf_pt_load(phdr, mstart, ra_start -
mstart, mstart);
+ } else if (ra_end < mend) {
+ populate_elf_pt_load(phdr, ra_end, mend -
ra_end, ra_end);
+ }
+ } else {
+ populate_elf_pt_load(phdr, mstart, mend - mstart,
mstart);
+ }
- /* Increment number of program headers. */
- (elf->e_phnum)++;
+ elf->e_phnum++;
+ bufp += sizeof(struct elf_phdr);
+ phdr = (struct elf_phdr *) bufp;
}
- return 0;
}
static unsigned long init_fadump_header(unsigned long addr)
@@ -1175,14 +1120,25 @@ static unsigned long init_fadump_header(unsigned long
addr)
memset(fdh, 0, sizeof(struct fadump_crash_info_header));
fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
- fdh->elfcorehdr_addr = addr;
+ fdh->version = FADUMP_HEADER_VERSION;
+ /* Elfcorehdr will now be prepared in the second kernel. */
+ fdh->elfcorehdr_addr = 0;
+ fdh->elfcorehdr_size = 0;
/* We will set the crashing cpu id in crash_fadump() during crash. */
fdh->crashing_cpu = FADUMP_CPU_UNKNOWN;
+
+ /* Need below vmcoreinfo in second kernel to prepare elfcorehdr. */
+ fdh->vmcoreinfo_raddr = fadump_relocate(paddr_vmcoreinfo_note());
+ fdh->vmcoreinfo_size = VMCOREINFO_NOTE_SIZE;
+
+
+ fdh->pt_regs_sz = sizeof(struct pt_regs);
/*
* When LPAR is terminated by PYHP, ensure all possible CPUs'
* register data is processed while exporting the vmcore.
*/
fdh->cpu_mask = *cpu_possible_mask;
+ fdh->cpu_mask_sz = sizeof(struct cpumask);
return addr;
}
@@ -1190,8 +1146,6 @@ static unsigned long init_fadump_header(unsigned long
addr)
static int register_fadump(void)
{
unsigned long addr;
- void *vaddr;
- int ret;
/*
* If no memory is reserved then we can not register for firmware-
@@ -1200,18 +1154,10 @@ static int register_fadump(void)
if (!fw_dump.reserve_dump_area_size)
return -ENODEV;
- ret = fadump_setup_crash_memory_ranges();
- if (ret)
- return ret;
-
addr = fw_dump.fadumphdr_addr;
/* Initialize fadump crash info header. */
addr = init_fadump_header(addr);
- vaddr = __va(addr);
-
- pr_debug("Creating ELF core headers at %#016lx\n", addr);
- fadump_create_elfcore_headers(vaddr);
/* register the future kernel dump with firmware. */
pr_debug("Registering for firmware-assisted kernel dump...\n");
@@ -1230,7 +1176,6 @@ void fadump_cleanup(void)
} else if (fw_dump.dump_registered) {
/* Un-register Firmware-assisted dump if it was registered. */
fw_dump.ops->fadump_unregister(&fw_dump);
- fadump_free_mem_ranges(&crash_mrange_info);
}
if (fw_dump.ops->fadump_cleanup)
@@ -1416,6 +1361,30 @@ static void fadump_release_memory(u64 begin, u64 end)
fadump_release_reserved_area(tstart, end);
}
+static void fadump_free_elfcorehdr_buf(void)
+{
+ struct fadump_crash_info_header *fdh;
+ unsigned long elfcorehdr_vaddr;
+
+ if (!fw_dump.fadumphdr_addr)
+ return;
+
+ fdh = (struct fadump_crash_info_header *) __va(fw_dump.fadumphdr_addr);
+
+ if (fdh->elfcorehdr_addr == 0 || fdh->elfcorehdr_size == 0)
+ return;
+
+ /*
+ * Reset the global `elfcorehdr_addr` before freeing `elfcorehdr`
+ * memory so that modules, like `vmcore`, do not access the invalid
memory.
+ */
+ elfcorehdr_addr = ELFCORE_ADDR_MAX;
+ elfcorehdr_vaddr = (unsigned long) __va(fdh->elfcorehdr_addr);
+ fadump_free_buffer(elfcorehdr_vaddr, fdh->elfcorehdr_size);
+ fdh->elfcorehdr_addr = 0;
+ fdh->elfcorehdr_size = 0;
+}
+
static void fadump_invalidate_release_mem(void)
{
mutex_lock(&fadump_mutex);
@@ -1424,6 +1393,8 @@ static void fadump_invalidate_release_mem(void)
return;
}
+ /* Free elfcorehdr buf before cleaning up the fadump reserved area. */
+ fadump_free_elfcorehdr_buf();
fadump_cleanup();
mutex_unlock(&fadump_mutex);
@@ -1632,6 +1603,107 @@ static void __init fadump_init_files(void)
return;
}
+static int __init fadump_setup_elfcorehdr_buf(struct fadump_crash_info_header *fdh)
+{
+ char *bufp;
+ int elf_phdr_cnt;
+ unsigned long elfcorehdr_size;
+
+ /*
+ * Program header for CPU notes comes first, followed by one for
+ * vmcoreinfo, and the remaining program headers correspond to
+ * memory regions.
+ */
+ elf_phdr_cnt = 2 + fw_dump.boot_mem_regs_cnt +
memblock_num_regions(memory);
+ elfcorehdr_size = sizeof(struct elfhdr) + (elf_phdr_cnt * sizeof(struct
elf_phdr));
+ elfcorehdr_size = PAGE_ALIGN(elfcorehdr_size);
+ bufp = (char *)fadump_alloc_buffer(elfcorehdr_size);
+ if (!bufp) {
+ pr_err("Failed to allocate %lu bytes for elfcorehdr\n",
+ elfcorehdr_size);
+ return -ENOMEM;
+ }
+ fdh->elfcorehdr_addr = virt_to_phys(bufp);
+ fdh->elfcorehdr_size = elfcorehdr_size;
+ return 0;
+}
+
+static bool is_fadump_header_compatible(struct fadump_crash_info_header *fdh)
+{
+ if (fdh->magic_number == FADUMP_CRASH_INFO_MAGIC_OLD) {
+ pr_err("Old magic number, can't process the dump.\n");
+ return false;
+ }
+
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ if (fdh->magic_number == swab64(FADUMP_CRASH_INFO_MAGIC))
+ pr_err("Endianness mismatch between the crashed and fadump
kernels.\n");
+ else
+ pr_err("Fadump header is corruped.\n");