From: Ewout van Bekkum <ew...@google.com>

The CMCI poll interval was updated to be the smaller of the original
30 seconds and the check_interval divided by 8 (minimum of 3 polls).

This resolves a bug where the CMCI storm handler is unable to return to
interrupt mode from polling mode if the check_interval is shorter than the
CMCI poll interval. This problem is caused by the mce_timer_fn function
which only allows the poll interval to be incremented up to the
check_interval, while the mce_intel_adjust_timer function requires the
poll interval to be greater than the CMCI poll interval before leaving
the CMCI_STORM_ACTIVE state.

Signed-off-by: Ewout van Bekkum <ew...@google.com>
Signed-off-by: Havard Skinnemoen <hskinnem...@google.com>
---
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  1 +
 arch/x86/kernel/cpu/mcheck/mce.c          |  5 +++++
 arch/x86/kernel/cpu/mcheck/mce_intel.c    | 15 +++++++++++----
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 09edd0b..2f0b1e8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -40,6 +40,7 @@ static inline void cmci_disable_bank(int bank) { }
 #endif
 
 void mce_timer_kick(unsigned long interval);
+unsigned long current_check_interval(void);
 
 #ifdef CONFIG_ACPI_APEI
 int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38..1ebdd34 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1265,6 +1265,11 @@ void mce_log_therm_throt_event(__u64 status)
  */
 static unsigned long check_interval = 5 * 60; /* 5 minutes */
 
+unsigned long current_check_interval(void)
+{
+       return check_interval;
+}
+
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 9a316b2..26eb8d3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -45,10 +45,17 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 static DEFINE_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD         1
-#define CMCI_POLL_INTERVAL     (30 * HZ)
 #define CMCI_STORM_INTERVAL    (1 * HZ)
 #define CMCI_STORM_THRESHOLD   15
 
+/*
+ * Poll every 30 seconds unless the current check_interval / 8 is smaller.
+ */
+static unsigned long cmci_poll_interval(void)
+{
+       return min(30UL * HZ, current_check_interval() * HZ / 8);
+}
+
 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
 static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
 static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
@@ -101,7 +108,7 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
 {
        int r;
 
-       if (interval < CMCI_POLL_INTERVAL)
+       if (interval < cmci_poll_interval())
                return interval;
 
        switch (__this_cpu_read(cmci_storm_state)) {
@@ -128,7 +135,7 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
                        cmci_reenable();
                        cmci_recheck();
                }
-               return CMCI_POLL_INTERVAL;
+               return cmci_poll_interval();
        default:
                /*
                 * We have shiny weather. Let the poll do whatever it
@@ -178,7 +185,7 @@ static bool cmci_storm_detect(void)
        cmci_storm_disable_banks();
        __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
        r = atomic_add_return(1, &cmci_storm_on_cpus);
-       mce_timer_kick(CMCI_POLL_INTERVAL);
+       mce_timer_kick(cmci_poll_interval());
 
        if (r == 1)
                pr_notice("CMCI storm detected: switching to poll mode\n");
-- 
2.0.0.526.g5318336

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to