On Tue, May 07, 2024 at 03:23:31PM +0530, Vignesh Balasubramanian wrote:
> Add a new .note section containing type, size, offset and flags of
> every xfeature that is present.
> 
> This information will be used by the debuggers to understand the XSAVE
> layout of the machine where the core file is dumped, and to read XSAVE
> registers, especially during cross-platform debugging.
> 
> Some background:
> 
> The XSAVE layouts of modern AMD and Intel CPUs differ, especially since
> Memory Protection Keys and the AVX-512 features have been incorporated into
> the AMD CPUs.
> This is because AMD never adopted (and hence never left room in the XSAVE
> layout for) the Intel MPX feature. Tools like GDB had assumed a fixed XSAVE
> layout matching that of Intel (based on the XCR0 mask).
> Hence, the core dumps from AMD CPUs didn't match the known size for the
> XCR0 mask. This resulted in GDB and other tools not being able to access
> the values of the AVX-512 and PKRU registers on AMD CPUs.
> To solve this, an interim solution has been accepted into GDB, and is
> already a part of GDB 14, thanks to this series of patches
> [ https://sourceware.org/pipermail/gdb-patches/2023-March/198081.html ].
> But this patch series depends on heuristics based on the total XSAVE
> register set size and the XCR0 mask to infer the layouts of the various
> register blocks for core dumps, and hence, is not a foolproof mechanism to
> determine the layout of the XSAVE area.
> 
> Hence this new core dump note has been proposed as a more robust mechanism
> to allow GDB/LLDB and other relevant tools to determine the layout of the
> XSAVE area of the machine where the corefile was dumped.
> The new core dump note (which is being proposed as a per-process .note
> section), NT_X86_XSAVE_LAYOUT (0x205) contains an array of structures.
> Each structure describes an individual extended feature containing offset,
> size and flags (that are obtained through the CPUID instruction) in a format
> roughly matching the following C structure:
> 
> struct xfeat_component {
>        u32 xfeat_type;
>        u32 xfeat_sz;
>        u32 xfeat_off;
>        u32 xfeat_flags;
> };
> 
> Co-developed-by: Jini Susan George <jinisusan.geo...@amd.com>
> Signed-off-by: Jini Susan George <jinisusan.geo...@amd.com>
> Signed-off-by: Vignesh Balasubramanian <vigba...@amd.com>
> ---
> v1->v2: Removed kernel internal defn dependency, code improvements
> 
>  arch/x86/Kconfig             |   1 +
>  arch/x86/include/asm/elf.h   |  34 +++++++++
>  arch/x86/kernel/fpu/xstate.c | 141 +++++++++++++++++++++++++++++++++++
>  fs/binfmt_elf.c              |   4 +-
>  include/uapi/linux/elf.h     |   1 +
>  5 files changed, 179 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 928820e61cb5..cc67daab3396 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -105,6 +105,7 @@ config X86
>       select ARCH_HAS_DEBUG_WX
>       select ARCH_HAS_ZONE_DMA_SET if EXPERT
>       select ARCH_HAVE_NMI_SAFE_CMPXCHG
> +     select ARCH_HAVE_EXTRA_ELF_NOTES
>       select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
>       select ARCH_MIGHT_HAVE_ACPI_PDC         if ACPI
>       select ARCH_MIGHT_HAVE_PC_PARPORT
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 1fb83d47711f..5952574db64b 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -13,6 +13,40 @@
>  #include <asm/auxvec.h>
>  #include <asm/fsgsbase.h>
>  
> +struct xfeat_component {
> +     u32 xfeat_type;
> +     u32 xfeat_sz;
> +     u32 xfeat_off;
> +     u32 xfeat_flags;
> +} __packed;
> +
> +_Static_assert(sizeof(struct xfeat_component)%4 == 0, "xfeat_component is 
> not aligned");
> +
> +enum custom_feature {
> +     FEATURE_XSAVE_FP = 0,
> +     FEATURE_XSAVE_SSE = 1,
> +     FEATURE_XSAVE_YMM = 2,
> +     FEATURE_XSAVE_BNDREGS = 3,
> +     FEATURE_XSAVE_BNDCSR = 4,
> +     FEATURE_XSAVE_OPMASK = 5,
> +     FEATURE_XSAVE_ZMM_Hi256 = 6,
> +     FEATURE_XSAVE_Hi16_ZMM = 7,
> +     FEATURE_XSAVE_PT = 8,
> +     FEATURE_XSAVE_PKRU = 9,
> +     FEATURE_XSAVE_PASID = 10,
> +     FEATURE_XSAVE_CET_USER = 11,
> +     FEATURE_XSAVE_CET_SHADOW_STACK = 12,
> +     FEATURE_XSAVE_HDC = 13,
> +     FEATURE_XSAVE_UINTR = 14,
> +     FEATURE_XSAVE_LBR = 15,
> +     FEATURE_XSAVE_HWP = 16,
> +     FEATURE_XSAVE_XTILE_CFG = 17,
> +     FEATURE_XSAVE_XTILE_DATA = 18,
> +     FEATURE_MAX,
> +     FEATURE_XSAVE_EXTENDED_START = FEATURE_XSAVE_YMM,
> +     FEATURE_XSAVE_EXTENDED_END = FEATURE_XSAVE_XTILE_DATA,
> +};
> +
>  typedef unsigned long elf_greg_t;
>  
>  #define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index 33a214b1a4ce..3d1c3c96e34d 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -13,6 +13,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/proc_fs.h>
>  #include <linux/vmalloc.h>
> +#include <linux/coredump.h>
>  
>  #include <asm/fpu/api.h>
>  #include <asm/fpu/regset.h>
> @@ -87,6 +88,8 @@ static unsigned int xstate_flags[XFEATURE_MAX] 
> __ro_after_init;
>  #define XSTATE_FLAG_SUPERVISOR       BIT(0)
>  #define XSTATE_FLAG_ALIGNED64        BIT(1)
>  
> +static const char owner_name[] = "LINUX";

This needs to move under the CONFIG_COREDUMP below (so say the build
bots).

> +
>  /*
>   * Return whether the system supports a given xfeature.
>   *
> @@ -1837,3 +1840,141 @@ int proc_pid_arch_status(struct seq_file *m, struct 
> pid_namespace *ns,
>       return 0;
>  }
>  #endif /* CONFIG_PROC_PID_ARCH_STATUS */
> +
> +#ifdef CONFIG_COREDUMP
> +static int get_sub_leaf(int custom_xfeat)

Why is this "int"? I don't imagine there are negative features?

> +{
> +     switch (custom_xfeat) {
> +     case FEATURE_XSAVE_YMM:                 return XFEATURE_YMM;
> +     case FEATURE_XSAVE_BNDREGS:             return XFEATURE_BNDREGS;
> +     case FEATURE_XSAVE_BNDCSR:              return XFEATURE_BNDCSR;
> +     case FEATURE_XSAVE_OPMASK:              return XFEATURE_OPMASK;
> +     case FEATURE_XSAVE_ZMM_Hi256:           return XFEATURE_ZMM_Hi256;
> +     case FEATURE_XSAVE_Hi16_ZMM:            return XFEATURE_Hi16_ZMM;
> +     case FEATURE_XSAVE_PT:                  return 
> XFEATURE_PT_UNIMPLEMENTED_SO_FAR;
> +     case FEATURE_XSAVE_PKRU:                return XFEATURE_PKRU;
> +     case FEATURE_XSAVE_PASID:               return XFEATURE_PASID;
> +     case FEATURE_XSAVE_CET_USER:            return XFEATURE_CET_USER;
> +     case FEATURE_XSAVE_CET_SHADOW_STACK:    return 
> XFEATURE_CET_KERNEL_UNUSED;
> +     case FEATURE_XSAVE_HDC:                 return XFEATURE_RSRVD_COMP_13;
> +     case FEATURE_XSAVE_UINTR:               return XFEATURE_RSRVD_COMP_14;
> +     case FEATURE_XSAVE_LBR:                 return XFEATURE_LBR;
> +     case FEATURE_XSAVE_HWP:                 return XFEATURE_RSRVD_COMP_16;
> +     case FEATURE_XSAVE_XTILE_CFG:           return XFEATURE_XTILE_CFG;
> +     case FEATURE_XSAVE_XTILE_DATA:          return XFEATURE_XTILE_DATA;
> +     default:
> +             pr_warn_ratelimited("Not a valid XSAVE Feature.");

This isn't very friendly; it's keeping secrets about the unknown value. :)
Also it's missing a newline. How about:

                pr_warn_ratelimited("Not a known XSAVE Feature: %u\n",
                                    custom_xfeat);

> +             return 0;
> +     }
> +}
> +
> +/*
> + * Dump type, size, offset and flag values for every xfeature that is 
> present.
> + */
> +static int dump_xsave_layout_desc(struct coredump_params *cprm)
> +{
> +     u32 supported_features = 0;
> +     struct xfeat_component xc;
> +     u32 eax, ebx, ecx, edx;
> +     int num_records = 0;
> +     int sub_leaf = 0;
> +     int i;
> +
> +     /* Find supported extended features */
> +     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +     supported_features = eax;
> +
> +     for (i = FEATURE_XSAVE_EXTENDED_START;
> +                     i <= FEATURE_XSAVE_EXTENDED_END; i++) {
> +             sub_leaf = get_sub_leaf(i);
> +             if (!sub_leaf)
> +                     continue;
> +             if (supported_features & (1U << sub_leaf)) {
> +                     cpuid_count(XSTATE_CPUID, sub_leaf, &eax, &ebx, &ecx, 
> &edx);
> +                     xc.xfeat_type = i;
> +                     xc.xfeat_sz = eax;
> +                     xc.xfeat_off = ebx;
> +                     /* Reserved for future use */
> +                     xc.xfeat_flags = 0;
> +
> +                     if (!dump_emit(cprm, &xc,
> +                                    sizeof(struct xfeat_component)))
> +                             return 0;
> +                     num_records++;
> +             }
> +     }
> +
> +     return num_records;
> +}
> +
> +static int get_xsave_desc_size(void)

This can return u32: never negative.

> +{
> +     int supported_features = 0;
> +     int xfeatures_count = 0;
> +     u32 eax, ebx, ecx, edx;
> +     int sub_leaf = 0;
> +     int i;

"i" can be u32 and then we can fix the get_sub_leaf() arg type.

> +
> +     /* Find supported extended features */
> +     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +     supported_features = eax;
> +
> +     for (i = FEATURE_XSAVE_EXTENDED_START;
> +                     i <= FEATURE_XSAVE_EXTENDED_END; i++) {
> +             sub_leaf = get_sub_leaf(i);
> +             if (!sub_leaf)
> +                     continue;
> +             if (supported_features & (1U << sub_leaf))
> +                     xfeatures_count++;
> +     }
> +
> +     return xfeatures_count * (sizeof(struct xfeat_component));
> +}
> +
> +int elf_coredump_extra_notes_write(struct coredump_params *cprm)
> +{
> +     int num_records = 0;
> +     struct elf_note en;
> +
> +     en.n_namesz = sizeof(owner_name);
> +     en.n_descsz = get_xsave_desc_size();
> +     en.n_type = NT_X86_XSAVE_LAYOUT;
> +
> +     if (!dump_emit(cprm, &en, sizeof(en)))
> +             return 1;
> +     if (!dump_emit(cprm, owner_name, en.n_namesz))
> +             return 1;
> +     if (!dump_align(cprm, 4))
> +             return 1;
> +
> +     num_records = dump_xsave_layout_desc(cprm);
> +     if (!num_records) {
> +             pr_warn_ratelimited("Error adding XSTATE layout ELF note. 
> XSTATE buffer in the core file will be unparseable.");

Missing trailing newline.

> +             return 1;
> +     }
> +
> +     /* Total size should be equal to the number of records */
> +     if ((sizeof(struct xfeat_component) * num_records) != en.n_descsz) {
> +             pr_warn_ratelimited("Error adding XSTATE layout ELF note. The 
> size of the .note section does not match with the total size of the 
> records.");

Same.

> +             return 1;
> +     }
> +
> +     return 0;
> +}
> +
> +/*
> + * Return the size of new note.
> + */
> +int elf_coredump_extra_notes_size(void)
> +{
> +     int size = 0;
> +
> +     /* NOTE Header */
> +     size += sizeof(struct elf_note);
> +     /* name + align */
> +     size += roundup(sizeof(owner_name), 4);
> +     size += get_xsave_desc_size();
> +
> +     return size;
> +}
> +#endif

Since it's a long if/endif, add: /* CONFIG_COREDUMP */ after the endif
here.

> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 5397b552fbeb..833bcb7e957b 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -2000,7 +2000,7 @@ static int elf_core_dump(struct coredump_params *cprm)
>       {
>               size_t sz = info.size;
>  
> -             /* For cell spufs */
> +             /* For cell spufs and x86 xstate */
>               sz += elf_coredump_extra_notes_size();
>  
>               phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
> @@ -2064,7 +2064,7 @@ static int elf_core_dump(struct coredump_params *cprm)
>       if (!write_note_info(&info, cprm))
>               goto end_coredump;
>  
> -     /* For cell spufs */
> +     /* For cell spufs and x86 xstate */
>       if (elf_coredump_extra_notes_write(cprm))
>               goto end_coredump;
>  
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b54b313bcf07..e30a9b47dc87 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -411,6 +411,7 @@ typedef struct elf64_shdr {
>  #define NT_X86_XSTATE        0x202           /* x86 extended state using 
> xsave */
>  /* Old binutils treats 0x203 as a CET state */
>  #define NT_X86_SHSTK 0x204           /* x86 SHSTK state */
> +#define NT_X86_XSAVE_LAYOUT  0x205   /* XSAVE layout description */
>  #define NT_S390_HIGH_GPRS    0x300   /* s390 upper register halves */
>  #define NT_S390_TIMER        0x301           /* s390 timer register */
>  #define NT_S390_TODCMP       0x302           /* s390 TOD clock comparator 
> register */
> -- 
> 2.34.1
> 

Otherwise looks good. I'd like to see feedback from Intel folks too.

Thanks for working on this!

-Kees

-- 
Kees Cook

Reply via email to