The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=03f868b163ad46d6f7cb03dc46fb83ca01fb8f69

commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69
Author:     Mark Johnston <ma...@freebsd.org>
AuthorDate: 2022-07-14 14:24:25 +0000
Commit:     Mark Johnston <ma...@freebsd.org>
CommitDate: 2022-07-14 14:28:01 +0000

    x86: Add a required store-load barrier in cpu_idle()
    
    ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread.
    In particular, it tries to detect whether the idle thread is running.
    There are two mechanisms for this:
    - tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle().  If
      tdq_cpu_idle == 0, then no IPI is needed;
    - idle_state, an x86-specific state flag which is updated after
      cpu_idleclock() is called.
    
    The implementation of the second mechanism is racy; the race can cause a
    CPU to go to sleep with pending work.  Specifically, cpu_idle_*() set
    idle_state = STATE_SLEEPING, then check for pending work by loading the
    tdq_load field of the CPU's runqueue.  These operations can be reordered
    so that the idle thread observes tdq_load == 0, and tdq_notify()
    observes idle_state == STATE_RUNNING.
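
    A minimal sketch of the racy pattern in C11 atomics (an illustration
    only, not the kernel code: idle_path() and notify_path() are
    hypothetical stand-ins for cpu_idle_*() and tdq_notify(), and the
    relaxed orderings model the plain atomic stores and loads involved):

        #include <stdatomic.h>

        #define STATE_RUNNING   0
        #define STATE_SLEEPING  1

        static _Atomic int idle_state = STATE_RUNNING;
        static _Atomic int tdq_load;            /* pending-work count */

        /* Idle thread: on x86, the store can sit in the store buffer while
         * the load below executes, i.e. store-load reordering is allowed. */
        static void
        idle_path(void)
        {
                atomic_store_explicit(&idle_state, STATE_SLEEPING,
                    memory_order_relaxed);
                if (atomic_load_explicit(&tdq_load, memory_order_relaxed) == 0) {
                        /* Go to sleep, possibly with work already queued. */
                }
        }

        /* Remote CPU: queue work, then decide whether an IPI is needed. */
        static void
        notify_path(void)
        {
                atomic_fetch_add_explicit(&tdq_load, 1, memory_order_relaxed);
                if (atomic_load_explicit(&idle_state, memory_order_relaxed) ==
                    STATE_RUNNING) {
                        /* Elide the IPI: the target still looks awake. */
                }
        }

    With relaxed ordering on both sides, the idle CPU's store to idle_state
    can become visible only after its load of tdq_load, so both paths can
    take their "nothing to do" branch at once.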
    
    Some counters indicate that the idle_state check in tdq_notify()
    frequently elides an IPI, so removing the check is undesirable.
    Instead, fix the problem by inserting a fence between the store to
    idle_state and the load in sched_runnable(), immediately before idling
    the CPU.
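
    In C11 terms, the fix amounts to a sequentially consistent fence between
    the idle thread's store and its subsequent load.  A sketch of the
    principle, reusing the hypothetical names above; the committed diff
    below instead uses the kernel's atomic_thread_fence_seq_cst():

        atomic_store_explicit(&idle_state, STATE_SLEEPING,
            memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);  /* store-load barrier */
        if (atomic_load_explicit(&tdq_load, memory_order_relaxed) == 0) {
                /* Any work queued before the remote CPU read idle_state
                 * is visible here by now, so sleeping is safe. */
        }

    The other direction is handled in tdq_notify(), which ensures that its
    update to tdq_load is visible before it reads idle_state and decides
    whether to call cpu_idle_wakeup().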
    
    PR:             264867
    Reviewed by:    mav, kib, jhb
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D35777
---
 sys/x86/x86/cpu_machdep.c | 103 ++++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 41 deletions(-)

diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index fa11f64e2779..040438043c73 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
+#include "opt_sched.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
@@ -532,32 +533,25 @@ static int        idle_mwait = 1;         /* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
-static void
-cpu_idle_acpi(sbintime_t sbt)
+static bool
+cpu_idle_enter(int *statep, int newstate)
 {
-       int *state;
+       KASSERT(atomic_load_int(statep) == STATE_RUNNING,
+           ("%s: state %d", __func__, atomic_load_int(statep)));
 
-       state = &PCPU_PTR(monitorbuf)->idle_state;
-       atomic_store_int(state, STATE_SLEEPING);
-
-       /* See comments in cpu_idle_hlt(). */
-       disable_intr();
-       if (sched_runnable())
-               enable_intr();
-       else if (cpu_idle_hook)
-               cpu_idle_hook(sbt);
-       else
-               acpi_cpu_c1();
-       atomic_store_int(state, STATE_RUNNING);
-}
-
-static void
-cpu_idle_hlt(sbintime_t sbt)
-{
-       int *state;
-
-       state = &PCPU_PTR(monitorbuf)->idle_state;
-       atomic_store_int(state, STATE_SLEEPING);
+       /*
+        * A fence is needed to prevent reordering of the load in
+        * sched_runnable() with this store to the idle state word.  Without it,
+        * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
+        * added load to the queue, and elide an IPI.  Then, sched_runnable()
+        * can observe tdq_load == 0, so the CPU ends up idling with pending
+        * work.  tdq_notify() similarly ensures that a prior update to tdq_load
+        * is visible before calling cpu_idle_wakeup().
+        */
+       atomic_store_int(statep, newstate);
+#if defined(SCHED_ULE) && defined(SMP)
+       atomic_thread_fence_seq_cst();
+#endif
 
        /*
         * Since we may be in a critical section from cpu_idle(), if
@@ -576,35 +570,62 @@ cpu_idle_hlt(sbintime_t sbt)
         * interrupt.
         */
        disable_intr();
-       if (sched_runnable())
+       if (sched_runnable()) {
                enable_intr();
-       else
-               acpi_cpu_c1();
-       atomic_store_int(state, STATE_RUNNING);
+               atomic_store_int(statep, STATE_RUNNING);
+               return (false);
+       } else {
+               return (true);
+       }
 }
 
 static void
-cpu_idle_mwait(sbintime_t sbt)
+cpu_idle_exit(int *statep)
+{
+       atomic_store_int(statep, STATE_RUNNING);
+}
+
+static void
+cpu_idle_acpi(sbintime_t sbt)
 {
        int *state;
 
        state = &PCPU_PTR(monitorbuf)->idle_state;
-       atomic_store_int(state, STATE_MWAIT);
+       if (cpu_idle_enter(state, STATE_SLEEPING)) {
+               if (cpu_idle_hook)
+                       cpu_idle_hook(sbt);
+               else
+                       acpi_cpu_c1();
+               cpu_idle_exit(state);
+       }
+}
 
-       /* See comments in cpu_idle_hlt(). */
-       disable_intr();
-       if (sched_runnable()) {
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+       int *state;
+
+       state = &PCPU_PTR(monitorbuf)->idle_state;
+       if (cpu_idle_enter(state, STATE_SLEEPING)) {
+               acpi_cpu_c1();
                atomic_store_int(state, STATE_RUNNING);
-               enable_intr();
-               return;
        }
+}
 
-       cpu_monitor(state, 0, 0);
-       if (atomic_load_int(state) == STATE_MWAIT)
-               __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
-       else
-               enable_intr();
-       atomic_store_int(state, STATE_RUNNING);
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+       int *state;
+
+       state = &PCPU_PTR(monitorbuf)->idle_state;
+       if (cpu_idle_enter(state, STATE_MWAIT)) {
+               cpu_monitor(state, 0, 0);
+               if (atomic_load_int(state) == STATE_MWAIT)
+                       __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+               else
+                       enable_intr();
+               cpu_idle_exit(state);
+       }
 }
 
 static void
