Linux supports logical CPU offline, controlled as shown below:

  # echo 0 > /sys/devices/system/cpu/cpuX/online
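The distinction that matters below is between "present" and "online"
CPUs, and it is visible directly in sysfs. The transcript here is
illustrative only, assuming an 8-CPU system where cpu3 is taken offline:

  # cat /sys/devices/system/cpu/present
  0-7
  # echo 0 > /sys/devices/system/cpu/cpu3/online
  # cat /sys/devices/system/cpu/online
  0-2,4-7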
Hardware doesn't know about OS offlining, so it will always broadcast an
MCE to every CPU in the system, even one that is merely parked in
cpu_dead. An offline CPU is therefore still in the MCE domain: it
executes do_machine_check() and increments mce_callin, so a rendezvous
count based on num_online_cpus() would always be off by the number of
offlined CPUs. mce_start() and mce_end() should instead use
cpu_present_map to count the CPUs in the rendezvous.

This patch does the following:

- Allow MCE logging from CPUs that are logically offlined.
- Ensure an offline CPU will not be chosen as the rendezvous master CPU.
- Collect logs from offline CPUs and report them via the rendezvous
  master.

Signed-off-by: Ashok Raj <ashok....@intel.com>
Reviewed-by: Tony Luck <tony.l...@intel.com>
---
Note for reviewers: standalone userspace sketches of the offline-log
ring buffer and the rendezvous-master election follow the diff.

 arch/x86/kernel/cpu/mcheck/mce.c | 101 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 69c7e3c..7c6b8b2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -769,6 +769,63 @@ out:
 }
 
 /*
+ * We can't call mce_log() for offline CPUs because it uses RCU for
+ * synchronization (and may call arbitrary driver code via
+ * x86_mce_decoder_chain that may also be surprised at being called
+ * from an offline CPU). Provide enough buffer space to hold a few
+ * errors that can be picked up later. Overflow should be really
+ * rare, so we don't track the lost entries themselves; we only
+ * count how many were dropped.
+ */
+
+#define OFFLINE_CPU_LOG_LEN 16
+
+struct offline_cpu_mce {
+	unsigned short head;
+	unsigned short tail;
+	struct mce mce_log[OFFLINE_CPU_LOG_LEN];
+};
+
+static struct offline_cpu_mce offline_mce;
+static unsigned int offline_mce_overflow;
+
+/*
+ * Add MCEs discovered on an offline CPU; they will be logged by the
+ * MCE rendezvous master. No lock is required, since MCEs are
+ * processed one CPU at a time, sequenced by the rendezvous master
+ * CPU. Safe to be called only from the MCE handler.
+ */
+static int offline_mce_add(struct mce *m)
+{
+	unsigned int next;
+
+	next = (offline_mce.tail + 1) % OFFLINE_CPU_LOG_LEN;
+	if (next == offline_mce.head) {
+		offline_mce_overflow++;
+		return -1;
+	}
+
+	offline_mce.mce_log[offline_mce.tail] = *m;
+	offline_mce.tail = next;
+	return 0;
+}
+
+static int offline_mce_get(struct mce *m)
+{
+	int ret = 0;
+
+	if (offline_mce.head == offline_mce.tail)
+		goto out;
+
+	*m = offline_mce.mce_log[offline_mce.head];
+	offline_mce.head = (offline_mce.head + 1) % OFFLINE_CPU_LOG_LEN;
+
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
  * The Monarch's reign. The Monarch is the CPU who entered
  * the machine check handler first. It waits for the others to
  * raise the exception too and then grades them. When any
@@ -799,13 +856,31 @@ static void mce_reign(void)
 	int global_worst = 0;
 	char *msg = NULL;
 	char *nmsg = NULL;
+	struct mce offline_m;
+
+
+	/*
+	 * If there are any MCEs logged by offline CPUs, gather and
+	 * report them via mce_log().
+	 */
+	while (offline_mce_get(&offline_m))
+		mce_log(&offline_m);
+
+	if (offline_mce_overflow) {
+		pr_info(HW_ERR "Lost %u errors logged by offline CPUs\n",
+			offline_mce_overflow);
+		offline_mce_overflow = 0;
+	}
 
 	/*
 	 * This CPU is the Monarch and the other CPUs have run
 	 * through their handlers.
 	 * Grade the severity of the errors of all the CPUs.
+	 * Intel CPUs broadcast MCEs to all CPUs that have booted.
+	 * Even CPUs that are merely parked in the OS for logical offline
+	 * must process the MCE.
 	 */
-	for_each_possible_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		int severity = mce_severity(&per_cpu(mces_seen, cpu),
 					    mca_cfg.tolerant,
 					    &nmsg, true);
@@ -841,7 +916,7 @@ static void mce_reign(void)
 	/*
 	 * Now clear all the mces_seen so that they don't reappear on
 	 * the next mce.
 	 */
-	for_each_possible_cpu(cpu)
+	for_each_present_cpu(cpu)
 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 }
 
@@ -857,8 +932,9 @@ static atomic_t global_nwo;
 static int mce_start(int *no_way_out)
 {
 	int order;
-	int cpus = num_online_cpus();
+	int cpus = num_present_cpus();
 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+	unsigned int this_cpu = smp_processor_id();
 
 	if (!timeout)
 		return -1;
@@ -868,6 +944,16 @@ static int mce_start(int *no_way_out)
 	 * global_nwo should be updated before mce_callin
 	 */
 	smp_wmb();
+
+	/*
+	 * If this CPU is offline, make sure it won't be elected as
+	 * the rendezvous master.
+	 */
+	if (cpu_is_offline(this_cpu)) {
+		while (!atomic_read(&mce_callin))
+			ndelay(SPINUNIT);
+	}
+
 	order = atomic_inc_return(&mce_callin);
 
 	/*
@@ -938,7 +1024,7 @@ static int mce_end(int order)
 
 	if (order == 1) {
 		/* CHECKME: Can this race with a parallel hotplug? */
-		int cpus = num_online_cpus();
+		int cpus = num_present_cpus();
 
 		/*
 		 * Monarch: Wait for everyone to go through their scanning
@@ -1033,6 +1119,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int i;
 	int worst = 0;
 	int severity;
+	unsigned int cpu = smp_processor_id();
+
 	/*
 	 * Establish sequential order between the CPUs entering the machine
 	 * check handler.
@@ -1153,7 +1241,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 		mce_ring_add(m.addr >> PAGE_SHIFT);
 
-	mce_log(&m);
+	if (cpu_is_offline(cpu))
+		offline_mce_add(&m);
+	else
+		mce_log(&m);
 
 	if (severity > worst) {
 		*final = m;
-- 
2.4.3
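As promised above, here is a minimal standalone userspace model of the
offline_mce_add()/offline_mce_get() buffer, for anyone who wants to poke
at the indexing outside the kernel. It is only a sketch under simplifying
assumptions: the struct mce payload is replaced by a plain int, and the
strictly sequenced one-CPU-at-a-time access that the rendezvous protocol
guarantees is emulated by a single thread.

/* offline_rb.c: userspace model of the offline-CPU MCE buffer above */
#include <stdio.h>

#define OFFLINE_CPU_LOG_LEN 16

static struct {
	unsigned short head;
	unsigned short tail;
	int log[OFFLINE_CPU_LOG_LEN];	/* stand-in for struct mce */
} rb;

static unsigned int overflow;

/* mirrors offline_mce_add(): count and drop when the buffer is full */
static int rb_add(int v)
{
	unsigned int next = (rb.tail + 1) % OFFLINE_CPU_LOG_LEN;

	if (next == rb.head) {
		overflow++;
		return -1;
	}
	rb.log[rb.tail] = v;
	rb.tail = next;
	return 0;
}

/* mirrors offline_mce_get(): returns 1 and fills *v while entries remain */
static int rb_get(int *v)
{
	if (rb.head == rb.tail)
		return 0;
	*v = rb.log[rb.head];
	rb.head = (rb.head + 1) % OFFLINE_CPU_LOG_LEN;
	return 1;
}

int main(void)
{
	int i, v;

	/* an offline CPU would enqueue from do_machine_check() */
	for (i = 0; i < 20; i++)
		rb_add(i);

	/* the rendezvous master drains the buffer in mce_reign() */
	while (rb_get(&v))
		printf("logged error %d\n", v);

	printf("lost %u entries to overflow\n", overflow);
	return 0;
}

Note the one-slot-reserved convention: with OFFLINE_CPU_LOG_LEN at 16,
the buffer holds at most 15 entries, so the run above logs 0..14 and
reports 5 lost, which is exactly what the overflow counter in the patch
would show.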
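Likewise, the cpu_is_offline() spin added to mce_start() can be
demonstrated with a small pthreads model. Again a sketch only: the
machine_check() thread function and the cpu struct are invented for the
demo, and the kernel's ndelay(SPINUNIT) backoff is replaced by a busy
spin. The point is that an "offline" CPU refuses to increment the callin
counter until someone else has, so atomic_inc_return() can never hand it
order 1 and it can never be elected rendezvous master.

/* monarch.c: toy model of the offline-CPU election guard (build with -pthread) */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int callin;	/* models mce_callin */

struct cpu {
	int id;
	bool offline;
	int order;
};

static void *machine_check(void *arg)
{
	struct cpu *c = arg;

	/* models the cpu_is_offline() wait added to mce_start() */
	if (c->offline)
		while (atomic_load(&callin) == 0)
			;	/* the kernel backs off with ndelay(SPINUNIT) */

	/* models order = atomic_inc_return(&mce_callin) */
	c->order = atomic_fetch_add(&callin, 1) + 1;
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	struct cpu cpu[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++) {
		cpu[i] = (struct cpu){ .id = i, .offline = (i == 2) };
		pthread_create(&tid[i], NULL, machine_check, &cpu[i]);
	}
	for (i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);

	for (i = 0; i < NCPUS; i++)
		printf("cpu%d%s: order %d\n", cpu[i].id,
		       cpu[i].offline ? " (offline)" : "", cpu[i].order);
	return 0;
}

Every run prints order 1 for one of the online CPUs; the offline CPU
always observes a nonzero count first and so reports order >= 2.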