On Fri, Oct 04, 2024 at 12:17:04PM +0200, Sebastian Andrzej Siewior wrote: > A timer/ hrtimer softirq is raised in-IRQ context. With threaded > interrupts enabled or on PREEMPT_RT this leads to waking the ksoftirqd > for the processing of the softirq. > Once the ksoftirqd is marked as pending (or is running) it will collect > all raised softirqs. This in turn means that a softirq which would have > been processed at the end of the threaded interrupt, which runs at an > elevated priority, is now moved to ksoftirqd which runs at SCHED_OTHER > priority and competes with every regular task for CPU resources. > This introduces long delays on heavy loaded systems and is not desired > especially if the system is not overloaded by the softirqs. > > Split the TIMER_SOFTIRQ and HRTIMER_SOFTIRQ processing into a dedicated > timers thread and let it run at the lowest SCHED_FIFO priority. > Wake-ups for RT tasks happen from hardirq context so only timer_list timers > and hrtimers for "regular" tasks are processed here. The higher priority > ensures that wakeups are performed before scheduling SCHED_OTHER tasks. > > Using a dedicated variable to store the pending softirq bits values > ensure that the timer are not accidentally picked up by ksoftirqd and > other threaded interrupts. > It shouldn't be picked up by ksoftirqd since it runs at lower priority. > However if ksoftirqd is already running while a timer fires, then > ksoftird will be PI-boosted due to the BH-lock to ktimer's priority. > Ideally we try to avoid having ksoftirqd running. > > The timer thread can pick up pending softirqs from ksoftirqd but only > if the softirq load is high. It is not be desired that the picked up > softirqs are processed at SCHED_FIFO priority under high softirq load > but this can already happen by a PI-boost by a force-threaded interrupt. > > [ frede...@kernel.org: rcutorture.c fixes, storm fix by introduction of > local_pending_timers() for tick_nohz_next_event() ] > > [ junxiao.ch...@intel.com: Ensure ktimersd gets woken up even if a > softirq is currently served. ] > > Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>
For the rcutorture pieces: Reviewed-by: Paul E. McKenney <paul...@kernel.org> > --- > include/linux/interrupt.h | 29 ++++++++++++++ > kernel/rcu/rcutorture.c | 6 +++ > kernel/softirq.c | 82 ++++++++++++++++++++++++++++++++++++++- > kernel/time/hrtimer.c | 4 +- > kernel/time/tick-sched.c | 2 +- > kernel/time/timer.c | 2 +- > 6 files changed, 120 insertions(+), 5 deletions(-) > > diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h > index 457151f9f263d..4a4f367cd6864 100644 > --- a/include/linux/interrupt.h > +++ b/include/linux/interrupt.h > @@ -616,6 +616,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq_irqoff(unsigned int nr); > extern void raise_softirq(unsigned int nr); > > +#ifdef CONFIG_PREEMPT_RT > +DECLARE_PER_CPU(struct task_struct *, timersd); > +DECLARE_PER_CPU(unsigned long, pending_timer_softirq); > + > +extern void raise_timer_softirq(void); > +extern void raise_hrtimer_softirq(void); > + > +static inline unsigned int local_pending_timers(void) > +{ > + return __this_cpu_read(pending_timer_softirq); > +} > + > +#else > +static inline void raise_timer_softirq(void) > +{ > + raise_softirq(TIMER_SOFTIRQ); > +} > + > +static inline void raise_hrtimer_softirq(void) > +{ > + raise_softirq_irqoff(HRTIMER_SOFTIRQ); > +} > + > +static inline unsigned int local_pending_timers(void) > +{ > + return local_softirq_pending(); > +} > +#endif > + > DECLARE_PER_CPU(struct task_struct *, ksoftirqd); > > static inline struct task_struct *this_cpu_ksoftirqd(void) > diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c > index bb75dbf5c800c..609687fd742d5 100644 > --- a/kernel/rcu/rcutorture.c > +++ b/kernel/rcu/rcutorture.c > @@ -2440,6 +2440,12 @@ static int rcutorture_booster_init(unsigned int cpu) > WARN_ON_ONCE(!t); > sp.sched_priority = 2; > sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); > +#ifdef CONFIG_PREEMPT_RT > + t = per_cpu(timersd, cpu); > + WARN_ON_ONCE(!t); > + sp.sched_priority = 2; > + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); > +#endif > } > > /* Don't allow time recalculation while creating a new task. */ > diff --git a/kernel/softirq.c b/kernel/softirq.c > index d082e7840f880..2d847405e5a7f 100644 > --- a/kernel/softirq.c > +++ b/kernel/softirq.c > @@ -624,6 +624,24 @@ static inline void tick_irq_exit(void) > #endif > } > > +#ifdef CONFIG_PREEMPT_RT > +DEFINE_PER_CPU(struct task_struct *, timersd); > +DEFINE_PER_CPU(unsigned long, pending_timer_softirq); > + > +static void wake_timersd(void) > +{ > + struct task_struct *tsk = __this_cpu_read(timersd); > + > + if (tsk) > + wake_up_process(tsk); > +} > + > +#else > + > +static inline void wake_timersd(void) { } > + > +#endif > + > static inline void __irq_exit_rcu(void) > { > #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED > @@ -636,6 +654,10 @@ static inline void __irq_exit_rcu(void) > if (!in_interrupt() && local_softirq_pending()) > invoke_softirq(); > > + if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && > + !(in_nmi() | in_hardirq())) > + wake_timersd(); > + > tick_irq_exit(); > } > > @@ -971,12 +993,70 @@ static struct smp_hotplug_thread softirq_threads = { > .thread_comm = "ksoftirqd/%u", > }; > > +#ifdef CONFIG_PREEMPT_RT > +static void timersd_setup(unsigned int cpu) > +{ > + sched_set_fifo_low(current); > +} > + > +static int timersd_should_run(unsigned int cpu) > +{ > + return local_pending_timers(); > +} > + > +static void run_timersd(unsigned int cpu) > +{ > + unsigned int timer_si; > + > + ksoftirqd_run_begin(); > + > + timer_si = local_pending_timers(); > + __this_cpu_write(pending_timer_softirq, 0); > + or_softirq_pending(timer_si); > + > + __do_softirq(); > + > + ksoftirqd_run_end(); > +} > + > +static void raise_ktimers_thread(unsigned int nr) > +{ > + trace_softirq_raise(nr); > + __this_cpu_or(pending_timer_softirq, 1 << nr); > +} > + > +void raise_hrtimer_softirq(void) > +{ > + raise_ktimers_thread(HRTIMER_SOFTIRQ); > +} > + > +void raise_timer_softirq(void) > +{ > + unsigned long flags; > + > + local_irq_save(flags); > + raise_ktimers_thread(TIMER_SOFTIRQ); > + wake_timersd(); > + local_irq_restore(flags); > +} > + > +static struct smp_hotplug_thread timer_threads = { > + .store = &timersd, > + .setup = timersd_setup, > + .thread_should_run = timersd_should_run, > + .thread_fn = run_timersd, > + .thread_comm = "ktimers/%u", > +}; > +#endif > + > static __init int spawn_ksoftirqd(void) > { > cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, > takeover_tasklets); > BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); > - > +#ifdef CONFIG_PREEMPT_RT > + BUG_ON(smpboot_register_percpu_thread(&timer_threads)); > +#endif > return 0; > } > early_initcall(spawn_ksoftirqd); > diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c > index cddcd08ea827f..133d49f703d93 100644 > --- a/kernel/time/hrtimer.c > +++ b/kernel/time/hrtimer.c > @@ -1811,7 +1811,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; > - raise_softirq_irqoff(HRTIMER_SOFTIRQ); > + raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); > @@ -1906,7 +1906,7 @@ void hrtimer_run_queues(void) > if (!ktime_before(now, cpu_base->softirq_expires_next)) { > cpu_base->softirq_expires_next = KTIME_MAX; > cpu_base->softirq_activated = 1; > - raise_softirq_irqoff(HRTIMER_SOFTIRQ); > + raise_hrtimer_softirq(); > } > > __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); > diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c > index 753a184c70907..efa3181607a2b 100644 > --- a/kernel/time/tick-sched.c > +++ b/kernel/time/tick-sched.c > @@ -859,7 +859,7 @@ static void tick_nohz_restart(struct tick_sched *ts, > ktime_t now) > > static inline bool local_timer_softirq_pending(void) > { > - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); > + return local_pending_timers() & BIT(TIMER_SOFTIRQ); > } > > /* > diff --git a/kernel/time/timer.c b/kernel/time/timer.c > index 0fc9d066a7be4..79f0dc73ac436 100644 > --- a/kernel/time/timer.c > +++ b/kernel/time/timer.c > @@ -2499,7 +2499,7 @@ static void run_local_timers(void) > */ > if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || > (i == BASE_DEF && tmigr_requires_handle_remote())) { > - raise_softirq(TIMER_SOFTIRQ); > + raise_timer_softirq(); > return; > } > } > -- > 2.45.2 >