In 4.15, without CONFIG_PREEMPT, we observed expand_fdtable() taking about 10 seconds waiting for synchronize_sched() to complete, while most of the other threads were running KVM guests.

In vcpu_run() there's a loop with the fairly common construct:

	if (need_resched()) {
		/* ... local unlocks ... */
		cond_resched();
		/* ... local locks ... */
	}

But because need_resched() wasn't true (and didn't become true until half of the RCU stall warning time had elapsed and rcu_implicit_dynticks_qs() called resched_cpu()), that path was never taken and cond_resched() was never called. In cond_resched() there is an unconditional call to rcu_all_qs(), which would DTRT.
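(For reference, the !CONFIG_PREEMPT version of _cond_resched() in kernel/sched/core.c looks roughly like this; quoting from memory, slightly abridged:

	int __sched _cond_resched(void)
	{
		if (should_resched(0)) {
			preempt_schedule_common();
			return 1;
		}
		/* Report an RCU quiescent state even when not rescheduling. */
		rcu_all_qs();
		return 0;
	}

So even a cond_resched() which doesn't actually schedule still reports the quiescent state; the problem is purely that we never reach it.)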
Now, there's a simple way to fix it for the specific case of KVM — we can find a place where we can just call rcu_all_qs(), something like this:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00520711..a304693 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7214,6 +7214,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 */
 	smp_mb__after_srcu_read_unlock();
 
+	/* Force quiescent state (if requested) before entering guest mode */
+	rcu_all_qs();
+
 	/*
 	 * This handles the case where a posted interrupt was
 	 * notified with kvm_vcpu_kick.
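(This ought to be almost free in the common case: unless I'm misreading the 4.15 code, rcu_all_qs() starts by bailing out when no quiescent state has been requested from this CPU, roughly:

	void rcu_all_qs(void)
	{
		if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
			return;	/* Common case: nobody is waiting on us. */
		/* ... otherwise actually report the quiescent state ... */
	}

That per-CPU rcu_urgent_qs flag is the same one the rcu_urgent_qs_requested() helper in the patch below reads.)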
But I wonder if we should attempt to fix the general case by making need_resched() return true when an RCU quiescent state is needed. To do that without having an out-of-line function call in kernel/rcu/tree.c would look something like the patch below. Paul, did you say you had other ideas about how to export/inline it?

Alternatively — or perhaps additionally — shouldn't CPUs which are currently in guest mode be counted as quiescent anyway? Or is that something we'll only ever want to do in full NOHZ mode?

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b3dbf95..2f8a3bd 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -126,6 +126,7 @@ static inline bool rcu_is_watching(void) { return true; }
 
 /* Avoid RCU read-side critical sections leaking across. */
 static inline void rcu_all_qs(void) { barrier(); }
+static inline bool rcu_urgent_qs_requested(void) { return false; }
 
 /* RCUtree hotplug events */
 #define rcutree_prepare_cpu      NULL
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 37d6fd3..d20b987 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,36 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	long long dynticks_nesting; /* Track irq/process nesting level. */
+				    /* Process level is worth LLONG_MAX/2. */
+	int dynticks_nmi_nesting;   /* Track NMI nesting level. */
+	atomic_t dynticks;	    /* Even value for idle, else odd. */
+	bool rcu_need_heavy_qs;     /* GP old, need heavy quiescent state. */
+	unsigned long rcu_qs_ctr;   /* Light universal quiescent state ctr. */
+	bool rcu_urgent_qs;	    /* GP old need light quiescent state. */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+	bool all_lazy;		    /* Are all CPU's CBs lazy? */
+	unsigned long nonlazy_posted;
+				    /* # times non-lazy CBs posted to CPU. */
+	unsigned long nonlazy_posted_snap;
+				    /* idle-period nonlazy_posted snapshot. */
+	unsigned long last_accelerate;
+				    /* Last jiffy CBs were accelerated. */
+	unsigned long last_advance_all;
+				    /* Last jiffy CBs were all advanced. */
+	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+};
+DECLARE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+static __always_inline bool rcu_urgent_qs_requested(void)
+{
+	return unlikely(raw_cpu_read(rcu_dynticks.rcu_urgent_qs));
+}
+
 void rcu_note_context_switch(bool preempt);
 int rcu_needs_cpu(u64 basem, u64 *nextevt);
 void rcu_cpu_stall_reset(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4d4e60..89f5814 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1616,7 +1616,8 @@ static inline int spin_needbreak(spinlock_t *lock)
 
 static __always_inline bool need_resched(void)
 {
-	return unlikely(tif_need_resched());
+	return unlikely(tif_need_resched()) ||
+		rcu_urgent_qs_requested();
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2..cf1c66c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -264,10 +264,11 @@ void rcu_bh_qs(void)
 #define rcu_eqs_special_exit() do { } while (0)
 #endif
 
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 };
+EXPORT_SYMBOL(rcu_dynticks);	/* for need_resched() */
 
 /*
  * There's a few places, currently just in the tracing infrastructure,
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 46a5d19..462b25b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -34,31 +34,6 @@
 
 #include "rcu_segcblist.h"
 
-/*
- * Dynticks per-CPU state.
- */
-struct rcu_dynticks {
-	long long dynticks_nesting; /* Track irq/process nesting level. */
-				    /* Process level is worth LLONG_MAX/2. */
-	int dynticks_nmi_nesting;   /* Track NMI nesting level. */
-	atomic_t dynticks;	    /* Even value for idle, else odd. */
-	bool rcu_need_heavy_qs;     /* GP old, need heavy quiescent state. */
-	unsigned long rcu_qs_ctr;   /* Light universal quiescent state ctr. */
-	bool rcu_urgent_qs;	    /* GP old need light quiescent state. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-	bool all_lazy;		    /* Are all CPU's CBs lazy? */
-	unsigned long nonlazy_posted;
-				    /* # times non-lazy CBs posted to CPU. */
-	unsigned long nonlazy_posted_snap;
-				    /* idle-period nonlazy_posted snapshot. */
-	unsigned long last_accelerate;
-				    /* Last jiffy CBs were accelerated. */
-	unsigned long last_advance_all;
-				    /* Last jiffy CBs were all advanced. */
-	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-};
-
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED  0
 #define RCU_KTHREAD_RUNNING  1

-- 
dwmw2