This is a strawman proposal to simplify the idle implementation and
eliminate a race.

Benefits over current code:
 - ttwu_queue_remote doesn't send an IPI unless one is actually needed
 - The diffstat should speak for itself :)
 - Less racy.  Spurious IPIs are possible, but only in narrow windows or
   when two wakeups occur in rapid succession.
 - Seems to work (?)

Issues:
 - Am I doing the percpu stuff right?
 - Needs work on non-x86 architectures
 - The !CONFIG_SMP case needs to be checked
 - Is "idlepoll" a good name for the new code?  It doesn't have *that*
   much to do with the idle state.  Maybe cpukick?

If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.
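
To make the intended handshake concrete, here is a minimal userspace
model of the three-state protocol, written with C11 atomics instead of
the kernel's atomic_t and per-cpu machinery.  The names and the
send_ipi()/handle_kick() stand-ins are illustrative only, not part of
the patch; the real waker is idlepoll_kick_cpu() and the real idle side
is idlepoll_begin_poll()/idlepoll_keep_polling()/idlepoll_end_poll().

  /*
   * Userspace sketch of the idlepoll handshake.  One 'state' word per
   * CPU in the real code; everything below is a stand-in.
   */
  #include <stdatomic.h>
  #include <stdio.h>

  enum { NOT_POLLING, POLLING, KICKED };

  static atomic_int state = NOT_POLLING;

  static void send_ipi(void)        /* stands in for smp_send_reschedule() */
  {
          puts("IPI sent");
  }

  static void handle_kick(void)     /* stands in for __idlepoll_kicked() */
  {
          puts("kick handled");
  }

  /* Waker: flip the state word; fall back to an IPI only if not polling. */
  static void kick_cpu(void)
  {
          if (atomic_exchange(&state, KICKED) == NOT_POLLING)
                  send_ipi();
  }

  /* Idler: mark ourselves polling, spin on the word, then clean up. */
  static void begin_poll(void)   { atomic_store(&state, POLLING); }
  static int  keep_polling(void) { return atomic_load(&state) == POLLING; }
  static void end_poll(void)
  {
          if (atomic_exchange(&state, NOT_POLLING) == KICKED)
                  handle_kick();
  }

  int main(void)
  {
          kick_cpu();               /* target not polling: IPI fallback path */

          begin_poll();             /* target is now "in idle" */
          kick_cpu();               /* no IPI: the xchg saw POLLING */
          printf("keep polling? %d\n", keep_polling());   /* prints 0 */
          end_poll();               /* xchg sees KICKED, runs the handler */
          return 0;
  }

The second kick_cpu() call shows how a polling CPU is woken without an
IPI; the first shows the fallback.  A kick that lands just before
begin_poll() is what produces the occasional spurious IPI mentioned
above.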

Signed-off-by: Andy Lutomirski <l...@amacapital.net>
---
 arch/x86/include/asm/mwait.h       |  22 +++----
 arch/x86/include/asm/thread_info.h |   2 -
 arch/x86/kernel/apm_32.c           |  12 ----
 include/linux/idlepoll.h           |  85 ++++++++++++++++++++++++++
 include/linux/preempt.h            |   5 --
 include/linux/sched.h              | 120 -------------------------------------
 kernel/cpu/idle.c                  |  60 ++++++++++++-------
 kernel/sched/core.c                |  93 +++++++++++-----------------
 8 files changed, 167 insertions(+), 232 deletions(-)
 create mode 100644 include/linux/idlepoll.h

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5..8addd29 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_MWAIT_H
 
 #include <linux/sched.h>
+#include <linux/idlepoll.h>
 
 #define MWAIT_SUBSTATE_MASK            0xf
 #define MWAIT_CSTATE_MASK              0xf
@@ -42,18 +43,17 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
  */
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-       if (!current_set_polling_and_test()) {
-               if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
-                       mb();
-                       clflush((void *)&current_thread_info()->flags);
-                       mb();
-               }
-
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               if (!need_resched())
-                       __mwait(eax, ecx);
+       idlepoll_begin_poll();
+       if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+               mb();
+               clflush(idlepoll_poll_ptr());
+               mb();
        }
-       current_clr_polling();
+
+       __monitor(idlepoll_poll_ptr(), 0, 0);
+       if (idlepoll_keep_polling())
+               __mwait(eax, ecx);
+       idlepoll_end_poll();
 }
 
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c0..caa3f63 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -234,8 +234,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
-#define TS_POLLING             0x0004  /* idle task polling need_resched,
-                                          skip sending interrupt */
 #define TS_RESTORE_SIGMASK     0x0008  /* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab0343..5848744 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,24 +841,12 @@ static int apm_do_idle(void)
        u32 eax;
        u8 ret = 0;
        int idled = 0;
-       int polling;
        int err = 0;
 
-       polling = !!(current_thread_info()->status & TS_POLLING);
-       if (polling) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we
-                * test NEED_RESCHED:
-                */
-               smp_mb();
-       }
        if (!need_resched()) {
                idled = 1;
                ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
        }
-       if (polling)
-               current_thread_info()->status |= TS_POLLING;
 
        if (!idled)
                return 0;
diff --git a/include/linux/idlepoll.h b/include/linux/idlepoll.h
new file mode 100644
index 0000000..072bc7e
--- /dev/null
+++ b/include/linux/idlepoll.h
@@ -0,0 +1,85 @@
+/*
+ * idlepoll.h: definitions and inlines for lockless wake-from-idle.
+ *
+ * Copyright (c) 2014 Andy Lutomirski (l...@amacapital.net)
+ */
+#ifndef __IDLEPOLL_H
+#define __IDLEPOLL_H
+
+#include <linux/percpu-defs.h>
+#include <asm/atomic.h>
+
+DECLARE_PER_CPU_ALIGNED(atomic_t, idlepoll_state);
+
+#define IDLEPOLL_NOT_POLLING   0  /* wakers must send IPI */
+#define IDLEPOLL_POLLING       1  /* ok to wake using idlepoll_state */
+#define IDLEPOLL_KICKED        2  /* __idlepoll_kicked will be called soon */
+
+void __idlepoll_kicked(void);
+
+/**
+ * idlepoll_poll_ptr - Address to use for hardware idle polling
+ *
+ * On architectures with an idle-until-an-address-is-written operation,
+ * this returns the address to use.
+ */
+static inline void *idlepoll_poll_ptr(void)
+{
+       return this_cpu_ptr(&idlepoll_state);
+}
+
+/**
+ * idlepoll_begin_poll - Start a polling idle period
+ *
+ * If an idle implementation polls, call this before polling.
+ */
+static inline void idlepoll_begin_poll(void)
+{
+       BUG_ON(atomic_read(this_cpu_ptr(&idlepoll_state)) == IDLEPOLL_POLLING);
+
+       /*
+        * Mark us polling.  Note: we don't particularly care what
+        * the previous value was.  If it was IDLEPOLL_KICKED, then
+        * an IPI is on its way.
+        */
+       atomic_set(this_cpu_ptr(&idlepoll_state), IDLEPOLL_POLLING);
+
+       /*
+        * No barrier here: we assume that whoever calls us has enough
+        * barriers to notice idlepoll_state changing.
+        */
+}
+
+/**
+ * idlepoll_end_poll - Finish a polling idle period and handle any pending kick
+ *
+ * If an idle implementation polls, call this after polling.
+ */
+static inline void idlepoll_end_poll(void)
+{
+       /*
+        * It's possible to be in IDLEPOLL_NOT_POLLING: an IPI
+        * could have come in and beaten us to the punch.
+        */
+       if (atomic_xchg(this_cpu_ptr(&idlepoll_state), IDLEPOLL_NOT_POLLING) ==
+           IDLEPOLL_KICKED)
+               __idlepoll_kicked();
+}
+
+/**
+ * idlepoll_keep_polling - Should a polling idle implementation keep polling?
+ *
+ * If an idle implementation polls, it should check this before pausing
+ * or asking hardware to wait.
+ */
+static inline bool idlepoll_keep_polling(void)
+{
+       return atomic_read(this_cpu_ptr(&idlepoll_state)) == IDLEPOLL_POLLING
+               && !test_preempt_need_resched();
+}
+
+#ifdef CONFIG_SMP
+void idlepoll_kick_cpu(int cpu);
+#endif
+
+#endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4e..04a3f39 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -138,11 +138,6 @@ do { \
 do { \
        set_preempt_need_resched(); \
 } while (0)
-#define preempt_fold_need_resched() \
-do { \
-       if (tif_need_resched()) \
-               set_preempt_need_resched(); \
-} while (0)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec..508df16 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2661,126 +2661,6 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
-/*
- * Idle thread specific functions to determine the need_resched
- * polling state. We have two versions, one based on TS_POLLING in
- * thread_info.status and one based on TIF_POLLING_NRFLAG in
- * thread_info.flags
- */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-       return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-       current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       __current_set_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-       current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
-static inline int tsk_is_polling(struct task_struct *p)
-{
-       return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-}
-
-static inline void __current_set_polling(void)
-{
-       set_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       __current_set_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        *
-        * XXX: assumes set/clear bit are identical barrier wise.
-        */
-       smp_mb__after_clear_bit();
-
-       return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-       clear_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb__after_clear_bit();
-
-       return unlikely(tif_need_resched());
-}
-
-#else
-static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       return unlikely(tif_need_resched());
-}
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-       return unlikely(tif_need_resched());
-}
-#endif
-
-static inline void current_clr_polling(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-        * Once the bit is cleared, we'll get IPIs with every new
-        * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-        * fold.
-        */
-       smp_mb(); /* paired with resched_task() */
-
-       preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
        return unlikely(tif_need_resched());
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494..5f772f5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -6,12 +6,14 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/idlepoll.h>
 
 #include <asm/tlb.h>
 
 #include <trace/events/power.h>
 
 static int __read_mostly cpu_idle_force_poll;
+DEFINE_PER_CPU_ALIGNED(atomic_t, idlepoll_state);
 
 void cpu_idle_poll_ctrl(bool enable)
 {
@@ -44,13 +46,39 @@ static inline int cpu_idle_poll(void)
        rcu_idle_enter();
        trace_cpu_idle_rcuidle(0, smp_processor_id());
        local_irq_enable();
-       while (!tif_need_resched())
+
+       idlepoll_begin_poll();
+       while (idlepoll_keep_polling())
                cpu_relax();
+       idlepoll_end_poll();
+
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        rcu_idle_exit();
        return 1;
 }
 
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL_GPL(idlepoll_state);
+
+/**
+ * idlepoll_kick_cpu - Cause a target cpu to call __idlepoll_kicked
+ *
+ * This implies a strong enough barrier that any writes before
+ * idlepoll_kick_cpu will be visible before __idlepoll_kicked is called.
+ *
+ * Do NOT call this on the current cpu.  Call from any context: no
+ * locks are required or taken.
+ */
+void idlepoll_kick_cpu(int cpu)
+{
+       if (atomic_xchg(per_cpu_ptr(&idlepoll_state, cpu), IDLEPOLL_KICKED) ==
+           IDLEPOLL_NOT_POLLING)
+               smp_send_reschedule(cpu);
+}
+EXPORT_SYMBOL_GPL(idlepoll_kick_cpu);
+#endif
+
 /* Weak implementations for optional arch specific functions */
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
@@ -65,12 +93,13 @@ void __weak arch_cpu_idle(void)
 /*
  * Generic idle loop implementation
  */
+
 static void cpu_idle_loop(void)
 {
        while (1) {
                tick_nohz_idle_enter();
 
-               while (!need_resched()) {
+               while (!test_preempt_need_resched()) {
                        check_pgt_cache();
                        rmb();
 
@@ -92,30 +121,16 @@ static void cpu_idle_loop(void)
                        if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
                                cpu_idle_poll();
                        } else {
-                               if (!current_clr_polling_and_test()) {
-                                       stop_critical_timings();
-                                       rcu_idle_enter();
-                                       arch_cpu_idle();
-                                       WARN_ON_ONCE(irqs_disabled());
-                                       rcu_idle_exit();
-                                       start_critical_timings();
-                               } else {
-                                       local_irq_enable();
-                               }
-                               __current_set_polling();
+                               stop_critical_timings();
+                               rcu_idle_enter();
+                               arch_cpu_idle();
+                               WARN_ON_ONCE(irqs_disabled());
+                               rcu_idle_exit();
+                               start_critical_timings();
                        }
                        arch_cpu_idle_exit();
                }
 
-               /*
-                * Since we fell out of the loop above, we know
-                * TIF_NEED_RESCHED must be set, propagate it into
-                * PREEMPT_NEED_RESCHED.
-                *
-                * This is required because for polling idle loops we will
-                * not have had an IPI to fold the state for us.
-                */
-               preempt_set_need_resched();
                tick_nohz_idle_exit();
                schedule_preempt_disabled();
        }
@@ -138,7 +153,6 @@ void cpu_startup_entry(enum cpuhp_state state)
         */
        boot_init_stack_canary();
 #endif
-       __current_set_polling();
        arch_cpu_idle_prepare();
        cpu_idle_loop();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131e..b771c46 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/idlepoll.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -504,6 +505,18 @@ static inline void init_hrtick(void)
 }
 #endif /* CONFIG_SCHED_HRTICK */
 
+void resched_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+       if (cpu == smp_processor_id())
+               set_preempt_need_resched();
+       else
+               idlepoll_kick_cpu(cpu);
+#else
+       set_preempt_need_resched();
+#endif
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -513,36 +526,16 @@ static inline void init_hrtick(void)
  */
 void resched_task(struct task_struct *p)
 {
-       int cpu;
-
        lockdep_assert_held(&task_rq(p)->lock);
 
+       /* XXX: this is probably unnecessary. */
        if (test_tsk_need_resched(p))
                return;
 
+       /* XXX: so is this. */
        set_tsk_need_resched(p);
 
-       cpu = task_cpu(p);
-       if (cpu == smp_processor_id()) {
-               set_preempt_need_resched();
-               return;
-       }
-
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
-               smp_send_reschedule(cpu);
-}
-
-void resched_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
-       if (!raw_spin_trylock_irqsave(&rq->lock, flags))
-               return;
-       resched_task(cpu_curr(cpu));
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       resched_cpu(task_cpu(p));
 }
 
 #ifdef CONFIG_SMP
@@ -586,32 +579,10 @@ unlock:
  */
 static void wake_up_idle_cpu(int cpu)
 {
-       struct rq *rq = cpu_rq(cpu);
-
        if (cpu == smp_processor_id())
                return;
 
-       /*
-        * This is safe, as this function is called with the timer
-        * wheel base lock of (cpu) held. When the CPU is on the way
-        * to idle and has not yet set rq->curr to idle then it will
-        * be serialized on the timer wheel base lock and take the new
-        * timer into account automatically.
-        */
-       if (rq->curr != rq->idle)
-               return;
-
-       /*
-        * We can set TIF_RESCHED on the idle task of the other CPU
-        * lockless. The worst case is that the other CPU runs the
-        * idle task through an additional NOOP schedule()
-        */
-       set_tsk_need_resched(rq->idle);
-
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(rq->idle))
-               smp_send_reschedule(cpu);
+       idlepoll_kick_cpu(cpu);
 }
 
 static bool wake_up_full_nohz_cpu(int cpu)
@@ -1493,19 +1464,23 @@ static void sched_ttwu_pending(void)
        raw_spin_unlock(&rq->lock);
 }
 
+void __idlepoll_kicked(void)
+{
+       set_preempt_need_resched();
+       sched_ttwu_pending();
+}
+EXPORT_SYMBOL_GPL(__idlepoll_kicked);
+
 void scheduler_ipi(void)
 {
-       /*
-        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-        * TIF_NEED_RESCHED remotely (for the first time) will also send
-        * this IPI.
-        */
-       preempt_fold_need_resched();
+       irq_enter();
 
-       if (llist_empty(&this_rq()->wake_list)
-                       && !tick_nohz_full_cpu(smp_processor_id())
-                       && !got_nohz_idle_kick())
-               return;
+       if (atomic_xchg(this_cpu_ptr(&idlepoll_state), IDLEPOLL_NOT_POLLING) ==
+           IDLEPOLL_KICKED)
+               __idlepoll_kicked();
+
+       if (!tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick())
+               goto out;
 
        /*
         * Not all reschedule IPI handlers call irq_enter/irq_exit, since
@@ -1520,9 +1495,7 @@ void scheduler_ipi(void)
         * however a fair share of IPIs are still resched only so this would
         * somewhat pessimize the simple resched case.
         */
-       irq_enter();
        tick_nohz_full_check();
-       sched_ttwu_pending();
 
        /*
         * Check if someone kicked us for doing the nohz idle load balance.
@@ -1531,13 +1504,15 @@ void scheduler_ipi(void)
                this_rq()->idle_balance = 1;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
+
+out:
        irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
        if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
-               smp_send_reschedule(cpu);
+               idlepoll_kick_cpu(cpu);
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
-- 
1.8.5.3
