On 3/17/26 08:23, Christian Loehle wrote:
> On 3/16/26 22:26, Christian Loehle wrote:
>> On 3/16/26 17:46, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Mon, Mar 16, 2026 at 10:02:48AM +0000, Christian Loehle wrote:
>>>> @@ -5686,11 +5718,20 @@ static void kick_cpus_irq_workfn(struct irq_work 
>>>> *irq_work)
>>>>             * task is picked subsequently. The latter is necessary to break
>>>>             * the wait when $cpu is taken by a higher sched class.
>>>>             */
>>>> -          if (cpu != cpu_of(this_rq))
>>>> +          if (cpu != this_cpu)
>>>>                    smp_cond_load_acquire(wait_kick_sync, VAL != 
>>>> ksyncs[cpu]);
>>>
>>> Given that irq_work is executed at the end of IRQ handling, we can just
>>> reschedule the irq work when the condition is not met (or separate that out
>>> into its own irq_work). That way, I think we can avoid the global lock.
>>>
>> I'll go poke at it some more, but I think it's not guaranteed that B actually
>> advances kick_sync if A keeps kicking. At least not if the handling is in 
>> HARD irqwork?
>> Or what would the separated out irq work do differently?
> 
> So in my particular example I do the SCX_KICK_WAIT in ops.enqueue(), which is 
> fair, but
> I don't think we can delay calling that until we've advanced our local 
> kick_sync and if we
> don't we end up in the deadlock, even if e.g. we separate out the retry (and 
> make that lazy),
> because then the local CPU is able to continuously issue new kicks (which 
> will have to
> be handled by the non-retry path) without advancing its own kick_sync.
> The closest thing to that I can get working is separating out the 
> SCX_KICK_WAIT entirely and
> make that lazy. In practice though that would realistically make the 
> SCX_KICK_WAIT latency
> most likely a lot higher than with the global lock, is that what you had in 
> mind?
> Or am I missing something here?

Something like the attached, for completeness.
From c2e882b4c54680a57bc0817b3c9819ff5f3bbd21 Mon Sep 17 00:00:00 2001
From: Christian Loehle <[email protected]>
Date: Tue, 17 Mar 2026 09:07:49 +0000
Subject: [PATCH 1/2] sched_ext: Prevent SCX_KICK_WAIT deadlocks with lazy irq_work

SCX_KICK_WAIT waits for the target CPU's kick_sync to change after
rescheduling it. If multiple CPUs form a wait cycle concurrently,
e.g. A waits for B, B for C and C for A, and the wait runs from
hard irq_work, each CPU can end up spinning in irq context waiting
for another CPU which cannot reschedule. No kick_sync advances and
the system deadlocks.

Deferring only the retry path is not enough. SCX_KICK_WAIT can be
issued from ops.enqueue(), so a CPU can keep arming fresh waits before
it goes through another SCX scheduling cycle and advances its own
kick_sync. If the initial wait handling still runs on hard irq_work,
a wait cycle can keep rearming without making progress.

Split the kick paths instead. Keep normal and idle kicks on the
existing hard irq_work so prompt rescheduling behavior is unchanged,
but route the full SCX_KICK_WAIT handling (i.e. initial kick, kick_sync
snapshot and retries) through a dedicated lazy irq_work.
This gives wait cycles a context that can yield and be retriggered
instead of pinning all participants in hard irq context, eliminating
the deadlock without serializing unrelated kicks (e.g. through a global
spinlock).

Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Signed-off-by: Christian Loehle <[email protected]>
---
 kernel/sched/ext.c   | 131 ++++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h |   2 +
 2 files changed, 94 insertions(+), 39 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f8826..8d133cc9c5f4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4713,6 +4713,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		if (!cpumask_empty(rq->scx.cpus_to_wait))
 			dump_line(&ns, "  cpus_to_wait   : %*pb",
 				  cpumask_pr_args(rq->scx.cpus_to_wait));
+		if (!cpumask_empty(rq->scx.cpus_to_wait_kick))
+			dump_line(&ns, "  cpus_to_wait_kick: %*pb",
+				  cpumask_pr_args(rq->scx.cpus_to_wait_kick));
 
 		used = seq_buf_used(&ns);
 		if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5583,12 +5586,43 @@ static bool can_skip_idle_kick(struct rq *rq)
 	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
 }
 
-static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
+static void kick_one_cpu(s32 cpu, struct rq *this_rq)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct scx_rq *this_scx = &this_rq->scx;
+	const struct sched_class *cur_class;
+	unsigned long flags;
+
+	raw_spin_rq_lock_irqsave(rq, flags);
+	cur_class = rq->curr->sched_class;
+
+	/*
+	 * During CPU hotplug, a CPU may depend on kicking itself to make
+	 * forward progress. Allow kicking self regardless of online state. If
+	 * @cpu is running a higher class task, we have no control over @cpu.
+	 * Skip kicking.
+	 */
+	if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
+	    !sched_class_above(cur_class, &ext_sched_class)) {
+		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+			if (cur_class == &ext_sched_class)
+				rq->curr->scx.slice = 0;
+			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+		}
+
+		resched_curr(rq);
+	} else {
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+	}
+
+	raw_spin_rq_unlock_irqrestore(rq, flags);
+}
+
+static void kick_one_cpu_wait(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct scx_rq *this_scx = &this_rq->scx;
 	const struct sched_class *cur_class;
-	bool should_wait = false;
 	unsigned long flags;
 
 	raw_spin_rq_lock_irqsave(rq, flags);
@@ -5609,12 +5643,10 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 		}
 
 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
-			if (cur_class == &ext_sched_class) {
+			if (cur_class == &ext_sched_class)
 				ksyncs[cpu] = rq->scx.kick_sync;
-				should_wait = true;
-			} else {
+			else
 				cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
-			}
 		}
 
 		resched_curr(rq);
@@ -5624,8 +5656,6 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 	}
 
 	raw_spin_rq_unlock_irqrestore(rq, flags);
-
-	return should_wait;
 }
 
 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
@@ -5646,20 +5676,10 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
 	struct scx_rq *this_scx = &this_rq->scx;
-	struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
-	bool should_wait = false;
-	unsigned long *ksyncs;
 	s32 cpu;
 
-	if (unlikely(!ksyncs_pcpu)) {
-		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
-		return;
-	}
-
-	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
-
 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
-		should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
+		kick_one_cpu(cpu, this_rq);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
 	}
@@ -5668,29 +5688,51 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 		kick_one_cpu_if_idle(cpu, this_rq);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
 	}
+}
 
-	if (!should_wait)
+static void kick_wait_irq_workfn(struct irq_work *irq_work)
+{
+	struct rq *this_rq = this_rq();
+	struct scx_rq *this_scx = &this_rq->scx;
+	struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
+	unsigned long *ksyncs;
+	s32 this_cpu = cpu_of(this_rq);
+	s32 cpu;
+
+	if (unlikely(!ksyncs_pcpu)) {
+		pr_warn_once("kick_wait_irq_workfn() called with NULL scx_kick_syncs");
 		return;
+	}
+
+	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
+
+	for_each_cpu(cpu, this_scx->cpus_to_wait_kick) {
+		kick_one_cpu_wait(cpu, this_rq, ksyncs);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait_kick);
+	}
 
 	for_each_cpu(cpu, this_scx->cpus_to_wait) {
 		unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
 
 		/*
-		 * Busy-wait until the task running at the time of kicking is no
-		 * longer running. This can be used to implement e.g. core
-		 * scheduling.
+		 * Check whether the target CPU has gone through a scheduling
+		 * cycle since SCX_KICK_WAIT was issued by testing kick_sync.
+		 * If the condition is already met, or this CPU kicked itself,
+		 * clear from the wait mask. Otherwise leave it set and
+		 * re-queue below: we yield and retry without ever blocking,
+		 * which eliminates the possibility of deadlock when multiple
+		 * CPUs issue SCX_KICK_WAIT targeting each other in a cycle.
 		 *
-		 * smp_cond_load_acquire() pairs with store_releases in
-		 * pick_task_scx() and put_prev_task_scx(). The former breaks
-		 * the wait if SCX's scheduling path is entered even if the same
-		 * task is picked subsequently. The latter is necessary to break
-		 * the wait when $cpu is taken by a higher sched class.
+		 * smp_load_acquire() pairs with the smp_store_release() in
+		 * pick_task_scx() and put_prev_task_scx().
 		 */
-		if (cpu != cpu_of(this_rq))
-			smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
-		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+		if (cpu == this_cpu ||
+		    smp_load_acquire(wait_kick_sync) != ksyncs[cpu])
+			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
 	}
+
+	if (!cpumask_empty(this_scx->cpus_to_wait))
+		irq_work_queue(&this_rq->scx.kick_wait_irq_work);
 }
 
 /**
@@ -5794,8 +5836,10 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait_kick, GFP_KERNEL, n));
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
+		rq->scx.kick_wait_irq_work = IRQ_WORK_INIT_LAZY(kick_wait_irq_workfn);
 
 		if (cpu_online(cpu))
 			cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
@@ -6543,16 +6587,25 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
 			raw_spin_rq_unlock(target_rq);
 		}
 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
+		irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
 	} else {
-		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
-
-		if (flags & SCX_KICK_PREEMPT)
-			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
-		if (flags & SCX_KICK_WAIT)
+		if (flags & SCX_KICK_WAIT) {
 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
-	}
+			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait_kick);
+
+			if (flags & SCX_KICK_PREEMPT)
+				cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
+
+			irq_work_queue(&this_rq->scx.kick_wait_irq_work);
+		} else {
+			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
+
+			if (flags & SCX_KICK_PREEMPT)
+				cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
 
-	irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+			irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+		}
+	}
 out:
 	local_irq_restore(irq_flags);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..010c0821d230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,11 +805,13 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_kick_if_idle;
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
+	cpumask_var_t		cpus_to_wait_kick;
 	unsigned long		kick_sync;
 	local_t			reenq_local_deferred;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
+	struct irq_work		kick_wait_irq_work;
 	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
-- 
2.34.1

Reply via email to