Linux supports logical CPU offline, controlled as shown below:

  # echo 0 > /sys/devices/system/cpu/cpuX/online
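The distinction that matters below is between "present" and "online"
CPUs, and it is visible directly in sysfs. The transcript here is
illustrative only, assuming an 8-CPU system where cpu3 is taken offline:

  # cat /sys/devices/system/cpu/present
  0-7
  # echo 0 > /sys/devices/system/cpu/cpu3/online
  # cat /sys/devices/system/cpu/online
  0-2,4-7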
Hardware doesn't know about OS offlining, so it will always broadcast an
MCE to every CPU in the system, even one that is merely parked in
cpu_dead. An offline CPU is therefore still in the MCE domain: it
executes do_machine_check() and increments mce_callin, so a rendezvous
count based on num_online_cpus() would always be off by the number of
offlined CPUs. mce_start() and mce_end() should instead use
cpu_present_map to count the CPUs in the rendezvous.

This patch does the following:

- Allow MCE logging from CPUs that are logically offlined.
- Ensure an offline CPU will not be chosen as the rendezvous master CPU.
- Collect logs from offline CPUs and report them via the rendezvous
  master.

Signed-off-by: Ashok Raj <ashok....@intel.com>
Reviewed-by: Tony Luck <tony.l...@intel.com>
---
Note for reviewers: standalone userspace sketches of the offline-log
ring buffer and the rendezvous-master election follow the diff.

 arch/x86/kernel/cpu/mcheck/mce.c | 101 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 69c7e3c..7c6b8b2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -769,6 +769,63 @@ out:
 }
 
 /*
+ * We can't call mce_log() for offline CPUs because it uses RCU for
+ * synchronization (and may call arbitrary driver code via
+ * x86_mce_decoder_chain that may also be surprised at being called
+ * from an offline CPU). Provide enough buffer space to hold a few
+ * errors that can be picked up later. Overflow should be really
+ * rare, so we don't track the lost entries themselves; we only
+ * count how many were dropped.
+ */
+
+#define OFFLINE_CPU_LOG_LEN 16
+
+struct offline_cpu_mce {
+	unsigned short head;
+	unsigned short tail;
+	struct mce mce_log[OFFLINE_CPU_LOG_LEN];
+};
+
+static struct offline_cpu_mce offline_mce;
+static unsigned int offline_mce_overflow;
+
+/*
+ * Add MCEs discovered on an offline CPU; they will be logged by the
+ * MCE rendezvous master. No lock is required, since MCEs are
+ * processed one CPU at a time, sequenced by the rendezvous master
+ * CPU. Safe to be called only from the MCE handler.
+ */
+static int offline_mce_add(struct mce *m)
+{
+	unsigned int next;
+
+	next = (offline_mce.tail + 1) % OFFLINE_CPU_LOG_LEN;
+	if (next == offline_mce.head) {
+		offline_mce_overflow++;
+		return -1;
+	}
+
+	offline_mce.mce_log[offline_mce.tail] = *m;
+	offline_mce.tail = next;
+	return 0;
+}
+
+static int offline_mce_get(struct mce *m)
+{
+	int ret = 0;
+
+	if (offline_mce.head == offline_mce.tail)
+		goto out;
+
+	*m = offline_mce.mce_log[offline_mce.head];
+	offline_mce.head = (offline_mce.head + 1) % OFFLINE_CPU_LOG_LEN;
+
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
  * The Monarch's reign. The Monarch is the CPU who entered
  * the machine check handler first. It waits for the others to
  * raise the exception too and then grades them. When any
@@ -799,13 +856,31 @@ static void mce_reign(void)
 	int global_worst = 0;
 	char *msg = NULL;
 	char *nmsg = NULL;
+	struct mce offline_m;
+
+
+	/*
+	 * If there are any MCEs logged by offline CPUs, gather and
+	 * report them via mce_log().
+	 */
+	while (offline_mce_get(&offline_m))
+		mce_log(&offline_m);
+
+	if (offline_mce_overflow) {
+		pr_info(HW_ERR "Lost %u errors logged by offline CPUs\n",
+			offline_mce_overflow);
+		offline_mce_overflow = 0;
+	}
 
 	/*
 	 * This CPU is the Monarch and the other CPUs have run
 	 * through their handlers.
 	 * Grade the severity of the errors of all the CPUs.
+	 * Intel CPUs broadcast MCEs to all CPUs that have booted.
+	 * Even CPUs that are merely parked in the OS for logical offline
+	 * must process the MCE.
 	 */
-	for_each_possible_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		int severity = mce_severity(&per_cpu(mces_seen, cpu),
 					    mca_cfg.tolerant,
 					    &nmsg, true);
@@ -841,7 +916,7 @@ static void mce_reign(void)
 	/*
 	 * Now clear all the mces_seen so that they don't reappear on
 	 * the next mce.
 	 */
-	for_each_possible_cpu(cpu)
+	for_each_present_cpu(cpu)
 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 }
 
@@ -857,8 +932,9 @@ static atomic_t global_nwo;
 static int mce_start(int *no_way_out)
 {
 	int order;
-	int cpus = num_online_cpus();
+	int cpus = num_present_cpus();
 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+	unsigned int this_cpu = smp_processor_id();
 
 	if (!timeout)
 		return -1;
@@ -868,6 +944,16 @@ static int mce_start(int *no_way_out)
 	 * global_nwo should be updated before mce_callin
 	 */
 	smp_wmb();
+
+	/*
+	 * If this CPU is offline, make sure it won't be elected as
+	 * the rendezvous master.
+	 */
+	if (cpu_is_offline(this_cpu)) {
+		while (!atomic_read(&mce_callin))
+			ndelay(SPINUNIT);
+	}
+
 	order = atomic_inc_return(&mce_callin);
 
 	/*
@@ -938,7 +1024,7 @@ static int mce_end(int order)
 
 	if (order == 1) {
 		/* CHECKME: Can this race with a parallel hotplug? */
-		int cpus = num_online_cpus();
+		int cpus = num_present_cpus();
 
 		/*
 		 * Monarch: Wait for everyone to go through their scanning
@@ -1033,6 +1119,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int i;
 	int worst = 0;
 	int severity;
+	unsigned int cpu = smp_processor_id();
+
 	/*
 	 * Establish sequential order between the CPUs entering the machine
 	 * check handler.
@@ -1153,7 +1241,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 		mce_ring_add(m.addr >> PAGE_SHIFT);
 
-	mce_log(&m);
+	if (cpu_is_offline(cpu))
+		offline_mce_add(&m);
+	else
+		mce_log(&m);
 
 	if (severity > worst) {
 		*final = m;
-- 
2.4.3
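As promised above, here is a minimal standalone userspace model of the
offline_mce_add()/offline_mce_get() buffer, for anyone who wants to poke
at the indexing outside the kernel. It is only a sketch under simplifying
assumptions: the struct mce payload is replaced by a plain int, and the
strictly sequenced one-CPU-at-a-time access that the rendezvous protocol
guarantees is emulated by a single thread.

/* offline_rb.c: userspace model of the offline-CPU MCE buffer above */
#include <stdio.h>

#define OFFLINE_CPU_LOG_LEN 16

static struct {
	unsigned short head;
	unsigned short tail;
	int log[OFFLINE_CPU_LOG_LEN];	/* stand-in for struct mce */
} rb;

static unsigned int overflow;

/* mirrors offline_mce_add(): count and drop when the buffer is full */
static int rb_add(int v)
{
	unsigned int next = (rb.tail + 1) % OFFLINE_CPU_LOG_LEN;

	if (next == rb.head) {
		overflow++;
		return -1;
	}
	rb.log[rb.tail] = v;
	rb.tail = next;
	return 0;
}

/* mirrors offline_mce_get(): returns 1 and fills *v while entries remain */
static int rb_get(int *v)
{
	if (rb.head == rb.tail)
		return 0;
	*v = rb.log[rb.head];
	rb.head = (rb.head + 1) % OFFLINE_CPU_LOG_LEN;
	return 1;
}

int main(void)
{
	int i, v;

	/* an offline CPU would enqueue from do_machine_check() */
	for (i = 0; i < 20; i++)
		rb_add(i);

	/* the rendezvous master drains the buffer in mce_reign() */
	while (rb_get(&v))
		printf("logged error %d\n", v);

	printf("lost %u entries to overflow\n", overflow);
	return 0;
}

Note the one-slot-reserved convention: with OFFLINE_CPU_LOG_LEN at 16,
the buffer holds at most 15 entries, so the run above logs 0..14 and
reports 5 lost, which is exactly what the overflow counter in the patch
would show.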
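Likewise, the cpu_is_offline() spin added to mce_start() can be
demonstrated with a small pthreads model. Again a sketch only: the
machine_check() thread function and the cpu struct are invented for the
demo, and the kernel's ndelay(SPINUNIT) backoff is replaced by a busy
spin. The point is that an "offline" CPU refuses to increment the callin
counter until someone else has, so atomic_inc_return() can never hand it
order 1 and it can never be elected rendezvous master.

/* monarch.c: toy model of the offline-CPU election guard (build with -pthread) */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int callin;	/* models mce_callin */

struct cpu {
	int id;
	bool offline;
	int order;
};

static void *machine_check(void *arg)
{
	struct cpu *c = arg;

	/* models the cpu_is_offline() wait added to mce_start() */
	if (c->offline)
		while (atomic_load(&callin) == 0)
			;	/* the kernel backs off with ndelay(SPINUNIT) */

	/* models order = atomic_inc_return(&mce_callin) */
	c->order = atomic_fetch_add(&callin, 1) + 1;
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	struct cpu cpu[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++) {
		cpu[i] = (struct cpu){ .id = i, .offline = (i == 2) };
		pthread_create(&tid[i], NULL, machine_check, &cpu[i]);
	}
	for (i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);

	for (i = 0; i < NCPUS; i++)
		printf("cpu%d%s: order %d\n", cpu[i].id,
		       cpu[i].offline ? " (offline)" : "", cpu[i].order);
	return 0;
}

Every run prints order 1 for one of the online CPUs; the offline CPU
always observes a nonzero count first and so reports order >= 2.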