On Wed, Oct 30, 2024 at 05:15:50PM +0100, Borislav Petkov wrote:
> On Tue, Oct 22, 2024 at 07:36:31PM +0000, Avadhut Naik wrote:
> > From: Yazen Ghannam <yazen.ghan...@amd.com>
> > 
> > A new "FRU Text in MCA" feature is defined where the Field Replaceable
> > Unit (FRU) Text for a device is represented by a string in the new
> > MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
> > bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).
> > 
> > The FRU Text is populated dynamically for each individual error state
> > (MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank
> > covers multiple devices, for example, a Unified Memory Controller (UMC)
> > bank that manages two DIMMs.
> > 
> > Since MCA_CONFIG[9] is instrumental in decoding FRU Text, it has to be
> > exported through the mce_record tracepoint so that userspace tools like
> > the rasdaemon can determine if FRU Text has been reported through the
> > MCA_SYND1 and MCA_SYND2 registers and output it.
> 
> IOW:
> 
> Author: Yazen Ghannam <yazen.ghan...@amd.com>
> Date:   Tue Oct 22 19:36:31 2024 +0000
> 
>     EDAC/mce_amd: Add support for FRU text in MCA
>     
>     A new "FRU Text in MCA" feature is defined where the Field Replaceable
>     Unit (FRU) Text for a device is represented by a string in the new
>     MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
>     bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).
>     
>     The FRU Text is populated dynamically for each individual error state
>     (MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank
>     covers multiple devices, for example, a Unified Memory Controller (UMC)
>     bank that manages two DIMMs.
>     
>     If SYND1 and SYND2 are !NULL, then userspace can assume that they
>     contain FRU text information. If they will report other information in
>     the future, then a way of communicating the info type contained must be
>     devised.
>     
>       [ Yazen: Add Avadhut as co-developer for wrapper changes. ]
>       [ bp: Do not expose MCA_CONFIG to userspace yet. ]

The entire struct mce_hw_err gets exposed through the mce tracepoint in
patch 3 of this set.

>     
>     Signed-off-by: Yazen Ghannam <yazen.ghan...@amd.com>
>     Co-developed-by: Avadhut Naik <avadhut.n...@amd.com>
>     Signed-off-by: Avadhut Naik <avadhut.n...@amd.com>
>     Signed-off-by: Borislav Petkov (AMD) <b...@alien8.de>
>     Link: 
> https://lore.kernel.org/r/20241022194158.110073-6-avadhut.n...@amd.com
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 4d936ee20e24..649a901ad563 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -61,6 +61,7 @@
>   *  - TCC bit is present in MCx_STATUS.
>   */
>  #define MCI_CONFIG_MCAX              0x1
> +#define MCI_CONFIG_FRUTEXT   BIT_ULL(9)
>  #define MCI_IPID_MCATYPE     0xFFFF0000
>  #define MCI_IPID_HWID                0xFFF
>  
> @@ -212,6 +213,7 @@ struct mce_hw_err {
>               struct {
>                       u64 synd1;              /* MCA_SYND1 MSR */
>                       u64 synd2;              /* MCA_SYND2 MSR */
> +                     u64 config;             /* MCA_CONFIG MSR */

Anything that is added here will automatically show up in the
tracepoint, since it's a dynamic array. That was one of the reasons to
do the wrapper struct idea, right?

>               } amd;
>       } vendor;
>  };
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index 6ca80fff1fea..65ace034af08 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -796,6 +796,7 @@ static void __log_error(unsigned int bank, u64 status, 
> u64 addr, u64 misc)
>  
>       if (mce_flags.smca) {
>               rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
> +             rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(bank), err.vendor.amd.config);
>  
>               if (m->status & MCI_STATUS_SYNDV) {
>                       rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
> diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
> index 0a89947e47bc..19a1c72fc2bf 100644
> --- a/arch/x86/kernel/cpu/mce/apei.c
> +++ b/arch/x86/kernel/cpu/mce/apei.c
> @@ -155,6 +155,8 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx 
> *ctx_info, u64 lapic_id)
>               fallthrough;
>       /* MCA_CONFIG */
>       case 4:
> +             err.vendor.amd.config = *(i_mce + 3);
> +             fallthrough;
>       /* MCA_MISC0 */
>       case 3:
>               m->misc = *(i_mce + 2);
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index fca23fe16abe..edc2c8033de8 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -681,6 +681,7 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, 
> int i)
>  
>       if (mce_flags.smca) {
>               m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
> +             err->vendor.amd.config = 
> mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(i));
>  
>               if (m->status & MCI_STATUS_SYNDV) {
>                       m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
> index 194d9fd47d20..62fcd92bf9d2 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -795,6 +795,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long 
> val, void *data)
>       struct mce *m = (struct mce *)data;
>       struct mce_hw_err *err = to_mce_hw_err(m);
>       unsigned int fam = x86_family(m->cpuid);
> +     u64 mca_config = err->vendor.amd.config;
>       int ecc;
>  
>       if (m->kflags & MCE_HANDLED_CEC)
> @@ -814,11 +815,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long 
> val, void *data)
>               ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"));
>  
>       if (boot_cpu_has(X86_FEATURE_SMCA)) {
> -             u32 low, high;
> -             u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
> -
> -             if (!rdmsr_safe(addr, &low, &high) &&
> -                 (low & MCI_CONFIG_MCAX))
> +             if (mca_config & MCI_CONFIG_MCAX)
>                       pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : 
> "-"));
>  
>               pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : 
> "-"));
> @@ -853,8 +850,15 @@ amd_decode_mce(struct notifier_block *nb, unsigned long 
> val, void *data)
>  
>               if (m->status & MCI_STATUS_SYNDV) {
>                       pr_cont(", Syndrome: 0x%016llx\n", m->synd);
> -                     pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 
> 0x%016llx",
> -                              err->vendor.amd.synd1, err->vendor.amd.synd2);
> +                     if (mca_config & MCI_CONFIG_FRUTEXT) {
> +                             char frutext[17];
> +
> +                             frutext[16] = '\0';
> +                             memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
> +                             memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
> +
> +                             pr_emerg(HW_ERR "FRU Text: %s", frutext);
> +                     }
>               }
>  
>               pr_cont("\n");
> 
>

The only changes I see are dropping a couple of kernel prints. I think
that's probably okay. But I don't think that's what you intend by not
exposing MCA_CONFIG to user space.

Thanks,
Yazen

Reply via email to