On CPU unplug, tasks which are in a migrate disabled region cannot be
pushed to a different CPU until they return to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Originally-by: Scott Wood <sw...@redhat.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
---
 kernel/sched/core.c  |   38 ++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |    4 ++++
 2 files changed, 38 insertions(+), 4 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -494,6 +494,11 @@ static bool task_self_migration(struct t
 	return true;
 }
 
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return rq->nr_pinned > 0;
+}
+
 #else /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) */
 static inline void task_lock_migration_ctrl(struct task_struct *p) { }
 static inline void task_unlock_migration_ctrl(struct task_struct *p) { }
@@ -504,6 +509,10 @@ static bool task_self_migration(struct t
 {
 	return false;
 }
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+	return false;
+}
 #endif /* !(defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)) */
 
 /*
@@ -3591,6 +3600,12 @@ void migrate_disable(void)
 	if (!current->migration_ctrl.disable_cnt) {
 		raw_spin_lock_irqsave(&current->pi_lock, flags);
 		current->migration_ctrl.disable_cnt++;
+		/*
+		 * Account the pinned task in the runqueue so that an
+		 * eventual CPU hot unplug operation will wait until
+		 * this task left the migrate disabled section.
+		 */
+		this_rq()->nr_pinned++;
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 	} else {
 		current->migration_ctrl.disable_cnt++;
@@ -3619,6 +3634,13 @@ void migrate_enable(void)
 	p->migration_ctrl.pending = NULL;
 
 	/*
+	 * Adjust the number of pinned tasks in the runqueue. No further
+	 * action required here. An eventually waiting CPU hot unplug
+	 * operation will be woken up once the CPU goes through idle.
+	 */
+	this_rq()->nr_pinned--;
+
+	/*
 	 * If the task was never scheduled out while in the migrate
 	 * disabled region and there is no migration request pending,
 	 * return.
@@ -6989,8 +7011,13 @@ static bool balance_push(struct rq *rq)
 	 * last task to vanish. The rcuwait_active() check is
 	 * accurate here because the waiter is pinned on this CPU
 	 * and can't obviously be running in parallel.
+	 *
+	 * On RT kernels this also has to check whether there are
+	 * pinned and scheduled out tasks on the runqueue. They
+	 * need to leave the migrate disabled section first.
 	 */
-	if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+	if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+	    rcuwait_active(&rq->hotplug_wait)) {
 		raw_spin_unlock(&rq->lock);
 		rcuwait_wake_up(&rq->hotplug_wait);
 		raw_spin_lock(&rq->lock);
@@ -7033,13 +7060,16 @@ static void balance_push_set(int cpu, bo
  * Invoked from a CPUs hotplug control thread after the CPU has been marked
  * inactive. All tasks which are not per CPU kernel threads are either
  * pushed off this CPU now via balance_push() or placed on a different CPU
- * during wakeup. Wait until the CPU is quiescent.
+ * during wakeup. Wait until the CPU is quiescent. On RT kernels this also
+ * waits for pinned non-runnable tasks to leave the migrate disabled
+ * section.
  */
 static void balance_hotplug_wait(void)
 {
 	struct rq *rq = this_rq();
 
-	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+	rcuwait_wait_event(&rq->hotplug_wait,
+			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
 			   TASK_UNINTERRUPTIBLE);
 }
 
@@ -7279,7 +7309,7 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	BUG_ON(rq->nr_running != 1);
+	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
 	rq_unlock_irqrestore(rq, &rf);
 
 	calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
 #endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	unsigned int		nr_pinned;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
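
For readers who want the accounting pattern in isolation, below is a minimal
user-space sketch of the idea, not the kernel code: a per-runqueue counter is
bumped when a task enters a migrate disabled section, dropped when it leaves,
and the hotplug waiter only proceeds once a single runnable task remains and
no pinned tasks are left. All names (toy_rq, toy_migrate_disable(),
toy_balance_hotplug_wait()) are made up, pthread locking stands in for the
rq/pi_lock locking, and for simplicity the wakeup is signalled directly from
toy_migrate_enable(), whereas the patch above defers the wakeup until the
unplugged CPU goes through idle and balance_push() runs.

#include <pthread.h>

struct toy_rq {
	pthread_mutex_t lock;
	pthread_cond_t  hotplug_wait;
	unsigned int    nr_running;
	unsigned int    nr_pinned;	/* tasks inside a migrate disabled section */
};

static void toy_migrate_disable(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	rq->nr_pinned++;		/* pin the current task to this CPU */
	pthread_mutex_unlock(&rq->lock);
}

static void toy_migrate_enable(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	/* Last pinned task leaving and only the waiter still runnable: wake it */
	if (--rq->nr_pinned == 0 && rq->nr_running == 1)
		pthread_cond_signal(&rq->hotplug_wait);
	pthread_mutex_unlock(&rq->lock);
}

/*
 * Hotplug side: wait until only one task (the waiter itself) is runnable
 * and no task is still inside a migrate disabled section.
 */
static void toy_balance_hotplug_wait(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	while (rq->nr_running != 1 || rq->nr_pinned > 0)
		pthread_cond_wait(&rq->hotplug_wait, &rq->lock);
	pthread_mutex_unlock(&rq->lock);
}

The reason the counter lives in the runqueue rather than only in the task is
visible in the diff: balance_push() and sched_cpu_dying() already hold the
runqueue lock and can check rq_has_pinned_tasks() there without touching any
individual task state.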