From: Aravind Gopalakrishnan <aravind.gopalakrish...@amd.com>
Date: Mon, 29 Feb 2016 16:32:56 -0600
Subject: [PATCH 1/3] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA
 errors

For Scalable MCA enabled processors, errors are listed per IP block. And
since it is not required for an IP to map to a particular bank, we need
to use HWID and McaType values from the MCx_IPID register to figure out
which IP a given bank represents.

We also have a new bit (TCC) in the MCx_STATUS register to indicate Task
context is corrupt.

Add logic here to decode errors from all known IP blocks for Fam17h
Model 00-0fh and to print TCC errors.

Boris:
- reorganize function placement in drivers/edac/mce_amd.c
- reflow comments

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrish...@amd.com>
Cc: "H. Peter Anvin" <h...@zytor.com>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: linux-edac <linux-e...@vger.kernel.org>
Cc: Mauro Carvalho Chehab <mche...@osg.samsung.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Tony Luck <tony.l...@intel.com>
Cc: x86-ml <x...@kernel.org>
Link: http://lkml.kernel.org/r/1456785179-14378-3-git-send-email-aravind.gopalakrish...@amd.com
Signed-off-by: Borislav Petkov <b...@suse.de>
---
 arch/x86/include/asm/mce.h           |  53 ++++++
 arch/x86/kernel/cpu/mcheck/mce_amd.c |  12 ++
 drivers/edac/mce_amd.c               | 342 ++++++++++++++++++++++++++++++++++-
 3 files changed, 406 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f9d4b8d4baf2..6f1380064471 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,18 @@
 /* AMD-specific bits */
 #define MCI_STATUS_DEFERRED    (1ULL<<44)  /* declare an uncorrected error */
 #define MCI_STATUS_POISON      (1ULL<<43)  /* access poisonous data */
+#define MCI_STATUS_TCC         (1ULL<<55)  /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ *  - Deferred error interrupt type is specifiable by bank.
+ *  - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ *    But should not be used to determine MSR numbers.
+ *  - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX                0x1
+#define MCI_IPID_MCATYPE       0xFFFF0000
+#define MCI_IPID_HWID          0xFFF
 
 /*
  * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -93,7 +105,9 @@
 
 /* 'SMCA': AMD64 Scalable MCA */
 #define MSR_AMD64_SMCA_MC0_CONFIG      0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID                0xc0002005
 #define MSR_AMD64_SMCA_MCx_CONFIG(x)   (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x)     (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
 
 /*
  * This structure contains all data related to the MCE log.  Also
@@ -291,4 +305,43 @@ struct cper_sec_mem_err;
 extern void apei_mce_report_mem_error(int corrected,
                                      struct cper_sec_mem_err *mem_err);
 
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+       SMCA_F17H_CORE_BLOCK = 0,       /* Core errors */
+       SMCA_DF_BLOCK,                  /* Data Fabric */
+       SMCA_UMC_BLOCK,                 /* Unified Memory Controller */
+       SMCA_PB_BLOCK,                  /* Parameter Block */
+       SMCA_PSP_BLOCK,                 /* Platform Security Processor */
+       SMCA_SMU_BLOCK,                 /* System Management Unit */
+       N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+       const char *amd_ipname;
+       unsigned int amd_hwid_value;
+};
+
+extern struct amd_hwid amd_hwid_mappings[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+       SMCA_LS_BLOCK = 0,      /* Load Store */
+       SMCA_IF_BLOCK,          /* Instruction Fetch */
+       SMCA_L2_CACHE_BLOCK,    /* L2 cache */
+       SMCA_DE_BLOCK,          /* Decoder unit */
+       RES,                    /* Reserved */
+       SMCA_EX_BLOCK,          /* Execution unit */
+       SMCA_FP_BLOCK,          /* Floating Point */
+       SMCA_L3_CACHE_BLOCK     /* L3 cache */
+};
+
+enum amd_df_mca_blocks {
+       SMCA_CS_BLOCK = 0,      /* Coherent Slave */
+       SMCA_PIE_BLOCK          /* Power management, Interrupts, etc */
+};
+#endif
+
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27bd5797..3188cd9eb9b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -71,6 +71,18 @@ static const char * const th_names[] = {
        "execution_unit",
 };
 
+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwid_mappings[] =
+{
+       [SMCA_F17H_CORE_BLOCK]  = { "f17h_core",        0xB0 },
+       [SMCA_DF_BLOCK]         = { "data fabric",      0x2E },
+       [SMCA_UMC_BLOCK]        = { "UMC",              0x96 },
+       [SMCA_PB_BLOCK]         = { "param block",      0x5 },
+       [SMCA_PSP_BLOCK]        = { "PSP",              0xFF },
+       [SMCA_SMU_BLOCK]        = { "SMU",              0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwid_mappings);
+
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
 
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e3a945ce374b..6820d17fea9c 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
        "Status Register File",
 };
 
+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+       "Load queue parity",
+       "Store queue parity",
+       "Miss address buffer payload parity",
+       "L1 TLB parity",
+       "",                                             /* reserved */
+       "DC tag error type 6",
+       "DC tag error type 1",
+       "Internal error type 1",
+       "Internal error type 2",
+       "Sys Read data error thread 0",
+       "Sys read data error thread 1",
+       "DC tag error type 2",
+       "DC data error type 1 (poison comsumption)",
+       "DC data error type 2",
+       "DC data error type 3",
+       "DC tag error type 4",
+       "L2 TLB parity",
+       "PDC parity error",
+       "DC tag error type 3",
+       "DC tag error type 5",
+       "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+       "microtag probe port parity error",
+       "IC microtag or full tag multi-hit error",
+       "IC full tag parity",
+       "IC data array parity",
+       "Decoupling queue phys addr parity error",
+       "L0 ITLB parity error",
+       "L1 ITLB parity error",
+       "L2 ITLB parity error",
+       "BPQ snoop parity on Thread 0",
+       "BPQ snoop parity on Thread 1",
+       "L1 BTB multi-match error",
+       "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+       "L2M tag multi-way-hit error",
+       "L2M tag ECC error",
+       "L2M data ECC error",
+       "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+       "uop cache tag parity error",
+       "uop cache data parity error",
+       "Insn buffer parity error",
+       "Insn dispatch queue parity error",
+       "Fetch address FIFO parity",
+       "Patch RAM data parity",
+       "Patch RAM sequencer parity",
+       "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+       "Watchdog timeout error",
+       "Phy register file parity",
+       "Flag register file parity",
+       "Immediate displacement register file parity",
+       "Address generator payload parity",
+       "EX payload parity",
+       "Checkpoint queue parity",
+       "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+       "Physical register file parity",
+       "Freelist parity error",
+       "Schedule queue parity",
+       "NSQ parity error",
+       "Retire queue parity",
+       "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+       "Shadow tag macro ECC error",
+       "Shadow tag macro multi-way-hit error",
+       "L3M tag ECC error",
+       "L3M tag multi-way-hit error",
+       "L3M data ECC error",
+       "XI parity, L3 fill done channel error",
+       "L3 victim queue parity",
+       "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+       "Illegal request from transport layer",
+       "Address violation",
+       "Security violation",
+       "Illegal response from transport layer",
+       "Unexpected response",
+       "Parity error on incoming request or probe response data",
+       "Parity error on incoming read response data",
+       "Atomic request parity",
+       "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+       "HW assert",
+       "Internal PIE register security violation",
+       "Error on GMI link",
+       "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+       "DRAM ECC error",
+       "Data poison error on DRAM",
+       "SDP parity error",
+       "Advanced peripheral bus error",
+       "Command/address parity error",
+       "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+       "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+       "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+       "SMU RAM ECC or parity error",
+};
+
 static bool f12h_mc0_mce(u16 ec, u8 xec)
 {
        bool ret = false;
@@ -691,6 +820,193 @@ static void decode_mc6_mce(struct mce *m)
        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 }
 
+static void decode_f17h_core_errors(u8 xec, unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       char *ip_name;
+       size_t len;
+
+       switch (mca_type) {
+       case SMCA_LS_BLOCK:
+               error_desc_array = f17h_ls_mce_desc;
+               ip_name = "LS";
+               len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+               if (xec == 0x4) {
+                       pr_cont("Unrecognized error code from LS MCA bank\n");
+                       return;
+               }
+
+               break;
+
+       case SMCA_IF_BLOCK:
+               error_desc_array = f17h_if_mce_desc;
+               ip_name = "IF";
+               len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+               break;
+
+       case SMCA_L2_CACHE_BLOCK:
+               error_desc_array = f17h_l2_mce_desc;
+               ip_name = "L2_Cache";
+               len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+               break;
+
+       case SMCA_DE_BLOCK:
+               error_desc_array = f17h_de_mce_desc;
+               ip_name = "DE";
+               len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+               break;
+
+       case SMCA_EX_BLOCK:
+               error_desc_array = f17h_ex_mce_desc;
+               ip_name = "EX";
+               len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+               break;
+
+       case SMCA_FP_BLOCK:
+               error_desc_array = f17h_fp_mce_desc;
+               ip_name = "FP";
+               len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+               break;
+
+       case SMCA_L3_CACHE_BLOCK:
+               error_desc_array = f17h_l3_mce_desc;
+               ip_name = "L3_Cache";
+               len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Unrecognized Mca Type value for F17h Core. Unable to decode errors\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized error code from %s MCA bank\n", ip_name);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       char *ip_name;
+       size_t len;
+
+       switch (mca_type) {
+       case  SMCA_CS_BLOCK:
+               error_desc_array = f17h_cs_mce_desc;
+               ip_name = "CS";
+               len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+               break;
+
+       case SMCA_PIE_BLOCK:
+               error_desc_array = f17h_pie_mce_desc;
+               ip_name = "PIE";
+               len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Unrecognized Mca Type value for DF. Unable to decode errors\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized error code from %s MCA bank\n", ip_name);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+       u32 low, high;
+       u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+       unsigned int hwid, mca_type, i;
+       u8 xec = XEC(m->status, xec_mask);
+       const char * const *error_desc_array;
+       char *ip_name;
+       size_t len;
+
+       if (rdmsr_safe(addr, &low, &high)) {
+               pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+               return;
+       }
+
+       hwid = high & MCI_IPID_HWID;
+       mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+       pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+       /*
+        * Based on hwid and mca_type values,
+        * decode errors from respective IPs.
+        * Note: mca_type values make sense only
+        * in the context of an hwid
+        */
+       for (i = 0; i < ARRAY_SIZE(amd_hwid_mappings); i++)
+               if (amd_hwid_mappings[i].amd_hwid_value == hwid)
+                       break;
+
+       switch (i) {
+       case SMCA_F17H_CORE_BLOCK:
+               ip_name = (mca_type == SMCA_L3_CACHE_BLOCK) ?
+                         "L3 Cache" : "F17h Core";
+               break;
+
+       case SMCA_DF_BLOCK:
+               ip_name = "DF";
+               break;
+
+       case SMCA_UMC_BLOCK:
+               error_desc_array = f17h_umc_mce_desc;
+               ip_name = "UMC";
+               len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+               break;
+
+       case SMCA_PB_BLOCK:
+               error_desc_array = f17h_pb_mce_desc;
+               ip_name = "PB";
+               len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+               break;
+
+       case SMCA_PSP_BLOCK:
+               error_desc_array = f17h_psp_mce_desc;
+               ip_name = "PSP";
+               len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+               break;
+
+       case SMCA_SMU_BLOCK:
+               error_desc_array = f17h_smu_mce_desc;
+               ip_name = "SMU";
+               len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+               break;
+
+       default:
+               pr_emerg(HW_ERR "HWID:%d does not match any existing IPs\n", hwid);
+               return;
+       }
+
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       if (i == SMCA_F17H_CORE_BLOCK) {
+               decode_f17h_core_errors(xec, mca_type);
+       } else if (i == SMCA_DF_BLOCK) {
+               decode_df_errors(xec, mca_type);
+       } else {
+               if (xec > len) {
+                       pr_cont("Unrecognized error code from %s MCA bank\n", ip_name);
+                       return;
+               }
+
+               pr_cont("%s.\n", error_desc_array[xec]);
+       }
+}
+
+
 static inline void amd_decode_err_code(u16 ec)
 {
        if (INT_ERROR(ec)) {
@@ -752,6 +1068,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        struct mce *m = (struct mce *)data;
        struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
        int ecc;
+       u32 ebx = cpuid_ebx(0x80000007);
 
        if (amd_filter_mce(m))
                return NOTIFY_STOP;
@@ -769,11 +1086,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 
-       if (c->x86 == 0x15 || c->x86 == 0x16)
+       if (c->x86 >= 0x15)
                pr_cont("|%s|%s",
                        ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
                        ((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
 
+       if (!!(ebx & BIT(3))) {
+               u32 low, high;
+               u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+               if (!rdmsr_safe(addr, &low, &high) &&
+                   (low & MCI_CONFIG_MCAX))
+                       pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+       }
+
        /* do the two bits[14:13] together */
        ecc = (m->status >> 45) & 0x3;
        if (ecc)
@@ -784,6 +1110,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        if (m->status & MCI_STATUS_ADDRV)
                pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
 
+       if (!!(ebx & BIT(3))) {
+               decode_smca_errors(m);
+               goto err_code;
+       }
+
        if (!fam_ops)
                goto err_code;
 
@@ -834,6 +1165,7 @@ static struct notifier_block amd_mce_dec_nb = {
 static int __init mce_amd_init(void)
 {
        struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 ebx = cpuid_ebx(0x80000007);
 
        if (c->x86_vendor != X86_VENDOR_AMD)
                return -ENODEV;
@@ -888,6 +1220,14 @@ static int __init mce_amd_init(void)
                fam_ops->mc2_mce = f16h_mc2_mce;
                break;
 
+       case 0x17:
+               xec_mask = 0x3f;
+               if (!(ebx & BIT(3))) {
+                       printk(KERN_WARNING "Decoding supported only on Scalable MCA enabled processors\n");
+                       return 0;
+               }
+               break;
+
        default:
                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
                kfree(fam_ops);
-- 
2.3.5


-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

Reply via email to