SCX_KICK_WAIT causes kick_cpus_irq_workfn() to busy-wait using
smp_cond_load_acquire() until the target CPU's current SCX task has been
context-switched out, i.e. until its kick_sync counter has advanced.
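In rough sketch form (illustrative only, not the exact code; the kicker
records a snapshot of the target's counter and the irq_work handler then
spins against that snapshot):

	/* snapshot of the target's counter, taken when the kick is issued */
	ksyncs[cpu] = READ_ONCE(cpu_rq(cpu)->scx.kick_sync);
	...
	/* later, in kick_cpus_irq_workfn(): spin until the target switches out */
	smp_cond_load_acquire(&cpu_rq(cpu)->scx.kick_sync, VAL != ksyncs[cpu]);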

If multiple CPUs each issue SCX_KICK_WAIT targeting one another
concurrently — e.g. CPU A waits for CPU B, B waits for CPU C, C waits for
CPU A — all CPUs can end up wedged inside smp_cond_load_acquire()
simultaneously.  Because each victim CPU is spinning in hardirq/irq_work
context, it cannot reschedule, so no kick_sync counter ever advances and
the system deadlocks.

Fix this by serializing access to the wait loop behind a global raw
spinlock (scx_kick_wait_lock).  Only one CPU at a time may execute the
wait loop; any other CPU that has SCX_KICK_WAIT work to do and fails to
acquire the lock records itself in scx_kick_wait_pending and returns.
When the active waiter finishes and releases the lock, it replays the
pending set by re-queuing each pending CPU's kick_cpus_irq_work, ensuring
no wait request is silently dropped.
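In condensed form (pending-mask locking omitted here for brevity; the
actual patch below guards scx_kick_wait_pending with
scx_kick_wait_pending_lock):

	if (!raw_spin_trylock(&scx_kick_wait_lock)) {
		/* another CPU is already waiting; defer, it will re-queue us */
		cpumask_set_cpu(this_cpu, &scx_kick_wait_pending);
		return;
	}

	/* ... existing smp_cond_load_acquire() wait loop ... */

	raw_spin_unlock(&scx_kick_wait_lock);

	/* replay any CPU that deferred while we held the lock */
	for_each_cpu(cpu, &scx_kick_wait_pending) {
		cpumask_clear_cpu(cpu, &scx_kick_wait_pending);
		irq_work_queue(&cpu_rq(cpu)->scx.kick_cpus_irq_work);
	}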

This is deliberately a coarse serialization: multiple simultaneous wait
operations now run sequentially, increasing latency.  In exchange,
deadlocks are impossible regardless of the cycle length (A->B->C->...->A).

Also clear scx_kick_wait_pending in free_kick_syncs() so that any stale
bits left by a CPU that deferred just as the scheduler exited are reset
before the next scheduler instance loads.

Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Signed-off-by: Christian Loehle <[email protected]>
---
 kernel/sched/ext.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f8826..b63ae13d0486 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -89,6 +89,19 @@ struct scx_kick_syncs {
 
 static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
 
+/*
+ * Serialize %SCX_KICK_WAIT processing across CPUs to avoid wait cycles.
+ * Callers failing to acquire @scx_kick_wait_lock defer by recording
+ * themselves in @scx_kick_wait_pending and are retriggered when the active
+ * waiter completes.
+ *
+ * Lock ordering: @scx_kick_wait_lock is always acquired before
+ * @scx_kick_wait_pending_lock; the two are never taken in the opposite order.
+ */
+static DEFINE_RAW_SPINLOCK(scx_kick_wait_lock);
+static DEFINE_RAW_SPINLOCK(scx_kick_wait_pending_lock);
+static cpumask_t scx_kick_wait_pending;
+
 /*
  * Direct dispatch marker.
  *
@@ -4279,6 +4292,13 @@ static void free_kick_syncs(void)
                if (to_free)
                        kvfree_rcu(to_free, rcu);
        }
+
+       /*
+        * Clear any CPUs that were waiting for the lock when the scheduler
+        * exited.  Their irq_work has already returned so no in-flight
+        * waiter can observe the stale bits on the next enable.
+        */
+       cpumask_clear(&scx_kick_wait_pending);
 }
 
 static void scx_disable_workfn(struct kthread_work *work)
@@ -5647,8 +5667,9 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
        struct rq *this_rq = this_rq();
        struct scx_rq *this_scx = &this_rq->scx;
        struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
-       bool should_wait = false;
+       bool should_wait = !cpumask_empty(this_scx->cpus_to_wait);
        unsigned long *ksyncs;
+       s32 this_cpu = cpu_of(this_rq);
        s32 cpu;
 
        if (unlikely(!ksyncs_pcpu)) {
@@ -5672,6 +5693,17 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
        if (!should_wait)
                return;
 
+       if (!raw_spin_trylock(&scx_kick_wait_lock)) {
+               raw_spin_lock(&scx_kick_wait_pending_lock);
+               cpumask_set_cpu(this_cpu, &scx_kick_wait_pending);
+               raw_spin_unlock(&scx_kick_wait_pending_lock);
+               return;
+       }
+
+       raw_spin_lock(&scx_kick_wait_pending_lock);
+       cpumask_clear_cpu(this_cpu, &scx_kick_wait_pending);
+       raw_spin_unlock(&scx_kick_wait_pending_lock);
+
        for_each_cpu(cpu, this_scx->cpus_to_wait) {
                unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
 
@@ -5686,11 +5718,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
                 * task is picked subsequently. The latter is necessary to break
                 * the wait when $cpu is taken by a higher sched class.
                 */
-               if (cpu != cpu_of(this_rq))
+               if (cpu != this_cpu)
                        smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
 
                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
        }
+
+       raw_spin_unlock(&scx_kick_wait_lock);
+
+       raw_spin_lock(&scx_kick_wait_pending_lock);
+       for_each_cpu(cpu, &scx_kick_wait_pending) {
+               cpumask_clear_cpu(cpu, &scx_kick_wait_pending);
+               irq_work_queue(&cpu_rq(cpu)->scx.kick_cpus_irq_work);
+       }
+       raw_spin_unlock(&scx_kick_wait_pending_lock);
 }
 
 /**
-- 
2.34.1

