On 3/17/26 08:23, Christian Loehle wrote:
> On 3/16/26 22:26, Christian Loehle wrote:
>> On 3/16/26 17:46, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Mon, Mar 16, 2026 at 10:02:48AM +0000, Christian Loehle wrote:
>>>> @@ -5686,11 +5718,20 @@ static void kick_cpus_irq_workfn(struct irq_work
>>>> *irq_work)
>>>> * task is picked subsequently. The latter is necessary to break
>>>> * the wait when $cpu is taken by a higher sched class.
>>>> */
>>>> - if (cpu != cpu_of(this_rq))
>>>> + if (cpu != this_cpu)
>>>> smp_cond_load_acquire(wait_kick_sync, VAL !=
>>>> ksyncs[cpu]);
>>>
>>> Given that irq_work is executed at the end of IRQ handling, we can just
>>> reschedule the irq work when the condition is not met (or separate that out
>>> into its own irq_work). That way, I think we can avoid the global lock.
>>>
>> I'll go poke at it some more, but I think it's not guaranteed that B actually
>> advances kick_sync if A keeps kicking. At least not if the handling is in
>> HARD irqwork?
>> Or what would the separated out irq work do differently?
>
> So in my particular example I do the SCX_KICK_WAIT in ops.enqueue(), which is
> fair, but
> I don't think we can delay calling that until we've advanced our local
> kick_sync and if we
> don't we end up in the deadlock, even if e.g. we separate out the retry (and
> make that lazy),
> because then the local CPU is able to continuously issue new kicks (which
> will have to
> be handled by the non-retry path) without advancing its own kick_sync.
> The closest thing to that I can get working is separating out the
> SCX_KICK_WAIT entirely and
> make that lazy. In practice though that would realistically make the
> SCX_KICK_WAIT latency
> most likely a lot higher than with the global lock, is that what you had in
> mind?
> Or am I missing something here?
Something like the attached, for completeness
From c2e882b4c54680a57bc0817b3c9819ff5f3bbd21 Mon Sep 17 00:00:00 2001
From: Christian Loehle <[email protected]>
Date: Tue, 17 Mar 2026 09:07:49 +0000
Subject: [PATCH 1/2] sched_ext: Prevent SCX_KICK_WAIT deadlocks with lazy irq_work
SCX_KICK_WAIT waits for the target CPU's kick_sync to change after
rescheduling it. If multiple CPUs form a wait cycle concurrently,
e.g. A waits for B, B for C and C for A, and the wait runs from
hard irq_work, each CPU can end up spinning in irq context waiting
for another CPU which cannot reschedule. No kick_sync advances and
the system deadlocks.
Deferring only the retry path is not enough. SCX_KICK_WAIT can be
issued from ops.enqueue(), so a CPU can keep arming fresh waits before
it goes through another SCX scheduling cycle and advances its own
kick_sync. If the initial wait handling still runs on hard irq_work,
a wait cycle can keep rearming without making progress.
Split the kick paths instead. Keep normal and idle kicks on the
existing hard irq_work so prompt rescheduling behavior is unchanged,
but route the full SCX_KICK_WAIT handling (i.e. initial kick, kick_sync
snapshot and retries) through a dedicated lazy irq_work.
This gives wait cycles a context that can yield and be retriggered
instead of pinning all participants in hard irq context, eliminating
the deadlock without serializing unrelated kicks (e.g. through a global
spinlock).
Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Signed-off-by: Christian Loehle <[email protected]>
---
kernel/sched/ext.c | 131 ++++++++++++++++++++++++++++++-------------
kernel/sched/sched.h | 2 +
2 files changed, 94 insertions(+), 39 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f8826..8d133cc9c5f4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4713,6 +4713,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (!cpumask_empty(rq->scx.cpus_to_wait))
dump_line(&ns, " cpus_to_wait : %*pb",
cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_wait_kick))
+ dump_line(&ns, " cpus_to_wait_kick: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait_kick));
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5583,12 +5586,43 @@ static bool can_skip_idle_kick(struct rq *rq)
return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}
-static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
+static void kick_one_cpu(s32 cpu, struct rq *this_rq)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_rq *this_scx = &this_rq->scx;
+ const struct sched_class *cur_class;
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+ cur_class = rq->curr->sched_class;
+
+ /*
+ * During CPU hotplug, a CPU may depend on kicking itself to make
+ * forward progress. Allow kicking self regardless of online state. If
+ * @cpu is running a higher class task, we have no control over @cpu.
+ * Skip kicking.
+ */
+ if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
+ !sched_class_above(cur_class, &ext_sched_class)) {
+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+ if (cur_class == &ext_sched_class)
+ rq->curr->scx.slice = 0;
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ }
+
+ resched_curr(rq);
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ }
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+}
+
+static void kick_one_cpu_wait(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
{
struct rq *rq = cpu_rq(cpu);
struct scx_rq *this_scx = &this_rq->scx;
const struct sched_class *cur_class;
- bool should_wait = false;
unsigned long flags;
raw_spin_rq_lock_irqsave(rq, flags);
@@ -5609,12 +5643,10 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
}
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
- if (cur_class == &ext_sched_class) {
+ if (cur_class == &ext_sched_class)
ksyncs[cpu] = rq->scx.kick_sync;
- should_wait = true;
- } else {
+ else
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
- }
}
resched_curr(rq);
@@ -5624,8 +5656,6 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
}
raw_spin_rq_unlock_irqrestore(rq, flags);
-
- return should_wait;
}
static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
@@ -5646,20 +5676,10 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
struct scx_rq *this_scx = &this_rq->scx;
- struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
- bool should_wait = false;
- unsigned long *ksyncs;
s32 cpu;
- if (unlikely(!ksyncs_pcpu)) {
- pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
- return;
- }
-
- ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
-
for_each_cpu(cpu, this_scx->cpus_to_kick) {
- should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
+ kick_one_cpu(cpu, this_rq);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
@@ -5668,29 +5688,51 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
kick_one_cpu_if_idle(cpu, this_rq);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
+}
- if (!should_wait)
+static void kick_wait_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_rq *this_scx = &this_rq->scx;
+ struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
+ unsigned long *ksyncs;
+ s32 this_cpu = cpu_of(this_rq);
+ s32 cpu;
+
+ if (unlikely(!ksyncs_pcpu)) {
+ pr_warn_once("kick_wait_irq_workfn() called with NULL scx_kick_syncs");
return;
+ }
+
+ ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
+
+ for_each_cpu(cpu, this_scx->cpus_to_wait_kick) {
+ kick_one_cpu_wait(cpu, this_rq, ksyncs);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait_kick);
+ }
for_each_cpu(cpu, this_scx->cpus_to_wait) {
unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
/*
- * Busy-wait until the task running at the time of kicking is no
- * longer running. This can be used to implement e.g. core
- * scheduling.
+ * Check whether the target CPU has gone through a scheduling
+ * cycle since SCX_KICK_WAIT was issued by testing kick_sync.
+ * If the condition is already met, or this CPU kicked itself,
+ * clear from the wait mask. Otherwise leave it set and
+ * re-queue below: we yield and retry without ever blocking,
+ * which eliminates the possibility of deadlock when multiple
+ * CPUs issue SCX_KICK_WAIT targeting each other in a cycle.
*
- * smp_cond_load_acquire() pairs with store_releases in
- * pick_task_scx() and put_prev_task_scx(). The former breaks
- * the wait if SCX's scheduling path is entered even if the same
- * task is picked subsequently. The latter is necessary to break
- * the wait when $cpu is taken by a higher sched class.
+	 * smp_load_acquire() pairs with the smp_store_release() calls in
+	 * pick_task_scx() and put_prev_task_scx().
*/
- if (cpu != cpu_of(this_rq))
- smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ if (cpu == this_cpu ||
+ smp_load_acquire(wait_kick_sync) != ksyncs[cpu])
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
+
+ if (!cpumask_empty(this_scx->cpus_to_wait))
+ irq_work_queue(&this_rq->scx.kick_wait_irq_work);
}
/**
@@ -5794,8 +5836,10 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait_kick, GFP_KERNEL, n));
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
+ rq->scx.kick_wait_irq_work = IRQ_WORK_INIT_LAZY(kick_wait_irq_workfn);
if (cpu_online(cpu))
cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
@@ -6543,16 +6587,25 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
raw_spin_rq_unlock(target_rq);
}
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
} else {
- cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
-
- if (flags & SCX_KICK_PREEMPT)
- cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
- if (flags & SCX_KICK_WAIT)
+ if (flags & SCX_KICK_WAIT) {
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
- }
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait_kick);
+
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
+
+ irq_work_queue(&this_rq->scx.kick_wait_irq_work);
+ } else {
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
+
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
- irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+ }
+ }
out:
local_irq_restore(irq_flags);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..010c0821d230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,11 +805,13 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
+ cpumask_var_t cpus_to_wait_kick;
unsigned long kick_sync;
local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
+ struct irq_work kick_wait_irq_work;
struct scx_dispatch_q bypass_dsq;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
--
2.34.1