The compound branch of rcu_read_unlock_special() arms one of three mechanisms -- the scheduler (via the need-resched flag), RCU_SOFTIRQ (raise_softirq_irqoff()) or irq_work_queue_on() -- in order to report a deferred quiescent state at a later time.
However, that is not enough in scenarios where an interrupts-disabled section spans the preempt_enable() of a preempt-disabled section: rcu_read_lock(); // receive IPI for expedited GP preempt_disable(); rcu_read_unlock(); // sets the need-resched flag local_irq_disable(); preempt_enable(); // cannot reschedule: IRQs are off local_irq_enable(); // now outside the compound RCU read-side critical section, but the // expedited GP is still held up Introduce a per-CPU bounded-delay rescue hrtimer, armed from the compound branch when an expedited GP needs this CPU's quiescent state, that reports the deferred QS via rcu_preempt_deferred_qs_try_report() once a clean (non-reader, non-compound) context is reached. To keep the rescue from firing once one of the normal mechanisms has already reported the quiescent state, cancel it from rcu_preempt_deferred_qs_irqrestore() -- the common path through which the deferred QS is reported. Without this the timer keeps firing (and re-arming) only to find the work already done; cancelling on the natural report path avoids the great majority of those fires (observed as a ~90% reduction in rescue-timer fires under rcutorture TREE03 with rcutorture.gp_exp=1). Signed-off-by: Joel Fernandes <[email protected]> --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 70 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c4352d0c3876..0cacbe642235 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/nmi.h> +#include <linux/hrtimer.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/export.h> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index dc03231b2d46..58b120bf0b7b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -218,6 +218,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. 
*/ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + struct hrtimer defer_qs_iw_rescue;/* Rescue timer for deferred-QS. */ int defer_qs_pending; /* irqwork or softirq pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6f5d31e3f1a3..324d08c7a91a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -592,6 +592,18 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + + /* + * A natural report path reached the deferred quiescent state before + * the bounded-delay rescue hrtimer fired. Cancel any pending rescue + * on this CPU so it does not fire only to find the quiescent state + * already reported. Use hrtimer_try_to_cancel() rather than + * hrtimer_cancel(): interrupts are disabled here and the timer is + * HARD/PINNED, so a callback that is already running must not be + * waited on (in that case this is a harmless no-op). + */ + hrtimer_try_to_cancel(&rdp->defer_qs_iw_rescue); + t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { @@ -772,6 +784,54 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) rcu_defer_qs_clear(rdp); } +/* + * Bounded-delay rescue timeout for the deferred-QS reporting. + * + * The compound branch of rcu_read_unlock_special() arms one of the + * scheduler, RCU_SOFTIRQ (raise_softirq_irqoff) or irq_work_queue_on() in order + * to report a deferred QS at a later time. + * + * However, that is not enough in scenarios where a local_irq_disable()d + * section spans the preempt_enable() call of a preempt-disabled section: + * + * rcu_read_lock(); + * // receive IPI for exp GP + * preempt_disable(); + * rcu_read_unlock(); // Set the "need reschedule" flag. + * local_irq_disable(); + * preempt_enable(); // Cannot reschedule as IRQs are off. 
+ * local_irq_enable(); + * // Now outside the compound RCU read-side critical section + * // however, the expedited GP is still held up. + * + * Introduce a rescue timer that fires defer_qs_rescue_delay_us (default 50) + * microseconds after the arming rcu_read_unlock() call, to fix this. + */ +static int defer_qs_rescue_delay_us = 50; +module_param(defer_qs_rescue_delay_us, int, 0644); +MODULE_PARM_DESC(defer_qs_rescue_delay_us, + "Microseconds before the rescue timer fires a deferred-QS report."); + +static enum hrtimer_restart +rcu_preempt_deferred_qs_rescue(struct hrtimer *hrtp) +{ + lockdep_assert_irqs_disabled(); + + /* + * Still inside a reader / compound section: deboosting is unsafe, so + * rearm and retry after a bounded delay. Once clean, + * rcu_preempt_deferred_qs_try_report() reports the deferred QS and + * releases any boost in the current task's context (or is a no-op if + * natural recovery already landed). + */ + if (!rcu_preempt_deferred_qs_try_report(current)) { + hrtimer_forward_now(hrtp, + us_to_ktime(defer_qs_rescue_delay_us)); + return HRTIMER_RESTART; + } + return HRTIMER_NORESTART; +} + /* * Check if expedited grace period processing during unlock is needed. * @@ -892,6 +952,13 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } + // Bounded-delay rescue: arm when an expedited GP needs this + // CPU's quiescent state (needs_exp) and the CPU is online, + // regardless of which mechanism above was chosen. 
+ if (needs_exp && cpu_online(rdp->cpu)) + hrtimer_start(&rdp->defer_qs_iw_rescue, + us_to_ktime(defer_qs_rescue_delay_us), + HRTIMER_MODE_REL_PINNED_HARD); local_irq_restore(flags); return; } @@ -1033,6 +1100,9 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); + hrtimer_setup(&rdp->defer_qs_iw_rescue, + rcu_preempt_deferred_qs_rescue, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); } #else /* #ifdef CONFIG_PREEMPT_RCU */ -- 2.34.1

