On AMD processors, MWAITX/MWAIT does not put the CPU core into the C1 state.
The core still consumes less power while waiting, and it exits the wait
faster than "Halt" does. This patch adds an interface, via the kernel
parameter "idle=", to select the mwaitx idle routine and configure its
timer value.

If "idle=mwaitx", the timeout is set to the maximum value
((2^32 - 1) TSC cycles, i.e. MWAITX_EBX_WAIT_TIMEOUT = 0xffffffff).
If "idle=mwaitx,100", the timeout is set to 100ns.
If the processor doesn't support MWAITX, then halt is used.

Signed-off-by: Huang Rui <ray.hu...@amd.com>
---
 arch/x86/include/asm/mwait.h     |  2 +
 arch/x86/include/asm/processor.h |  2 +-
 arch/x86/kernel/process.c        | 79 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index b91136f..c4e51e7 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,8 @@
 #define CPUID5_ECX_INTERRUPT_BREAK     0x2
 
 #define MWAIT_ECX_INTERRUPT_BREAK      0x1
+#define MWAITX_ECX_TIMER_ENABLE                0x2
+#define MWAITX_EBX_WAIT_TIMEOUT                0xffffffff
 
 static inline void __monitor(const void *eax, unsigned long ecx,
                             unsigned long edx)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 23ba676..0f60e94 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -733,7 +733,7 @@ extern unsigned long                boot_option_idle_override;
 extern bool                    amd_e400_c1e_detected;
 
 enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
-                        IDLE_POLL};
+                        IDLE_POLL, IDLE_MWAITX};
 
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3..9d68193 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,6 +30,7 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/x86_init.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -276,6 +277,7 @@ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
 
 static void (*x86_idle)(void);
+static unsigned long idle_param;
 
 #ifndef CONFIG_SMP
 static inline void play_dead(void)
@@ -444,6 +446,17 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
        return 1;
 }
 
+/*
+ * Report whether MWAITX is unavailable on the given CPU.
+ *
+ * Returns 1 if the CPU is not an AMD part or does not advertise the
+ * X86_FEATURE_MWAITT feature bit, and 0 when both checks pass.
+ */
+static int not_support_mwaitx(const struct cpuinfo_x86 *c)
+{
+       int is_amd = (c->x86_vendor == X86_VENDOR_AMD);
+
+       /* The feature bit is only consulted once the vendor check passes. */
+       return !(is_amd && cpu_has(c, X86_FEATURE_MWAITT));
+}
+
 /*
  * MONITOR/MWAIT with no hints, used for default default C1 state.
  * This invokes MWAIT with interrutps enabled and no flags,
@@ -470,12 +483,45 @@ static void mwait_idle(void)
        __current_clr_polling();
 }
 
+/*
+ * AMD Excavator processors support the new MONITORX/MWAITX instructions.
+ * The function is similar to mwait but with a timer. On AMD platforms
+ * mwaitx does not let the core go into C1 state. This provides for a
+ * faster waiting exit speed. The user can configure the idle method and
+ * timer value via the idle kernel parameter.
+ *
+ * The monitor is armed on the current thread's thread_info flags word,
+ * and MWAITX is only issued when no reschedule is pending; every other
+ * path simply re-enables interrupts. The EBX timer value comes from
+ * idle_param, which idle_setup() fills in while parsing "idle=mwaitx".
+ */
+static void mwaitx_idle(void)
+{
+       unsigned long ebx, ecx;
+
+       ebx = idle_param;       /* timeout, expressed in TSC cycles */
+       ecx = MWAITX_ECX_TIMER_ENABLE;  /* timer-enable flag for the ECX input */
+
+       if (!current_set_polling_and_test()) {
+               __monitorx((void *)&current_thread_info()->flags, 0, 0);
+               /* Re-check after arming so a pending resched is not slept through. */
+               if (!need_resched())
+                       __sti_mwaitx(0, ebx, ecx);      /* presumably enables IRQs, then waits */
+               else
+                       local_irq_enable();
+       } else {
+               /* Work is already pending: skip the wait entirely. */
+               local_irq_enable();
+       }
+       __current_clr_polling();
+}
+
 void select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance 
may degrade\n");
 #endif
+
+       if (boot_option_idle_override == IDLE_MWAITX &&
+           not_support_mwaitx(c)) {
+               pr_warn_once("WARNING: mwaitx not supported, using default idle 
support\n");
+               x86_idle = default_idle;
+       }
+
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;
 
@@ -499,6 +545,8 @@ void __init init_amd_e400_c1e_mask(void)
 
 static int __init idle_setup(char *str)
 {
+       unsigned long timeout, tsc_freq;
+
        if (!str)
                return -EINVAL;
 
@@ -524,6 +572,37 @@ static int __init idle_setup(char *str)
                 * of boot_option_idle_override.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
+       } else if (!strncmp(str, "mwaitx", 6)) {
+               /*
+                * If the boot option of "idle=mwaitx" is added, it means
+                * that mwaitx will be enabled if current processor
+                * supports it. If not supported, use default_idle.
+                */
+               x86_idle = mwaitx_idle;
+               boot_option_idle_override = IDLE_MWAITX;
+               str += 6;
+               if (str && (str[0] == ',')) {
+                       if (kstrtoul(str + 1, 0, &timeout)) {
+                               pr_warn_once("WARNING: timer value should be 
numerical\n");
+                               return -1;
+                       }
+
+                       tsc_freq = x86_platform.calibrate_tsc();
+                       if (!tsc_freq) {
+                               pr_warn_once("WARNING: can not calculate TSC 
khz\n");
+                               return -1;
+                       }
+
+                       /*
+                        * TSC loops (EBX input) = Timer(nsec) *
+                        * TSC freq(khz) / 1000000
+                        */
+                       timeout = timeout * tsc_freq;
+                       do_div(timeout, 1000000);
+
+                       idle_param = timeout;
+               } else
+                       idle_param = MWAITX_EBX_WAIT_TIMEOUT;
        } else
                return -1;
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to