On Sat, 2015-05-30 at 15:08 +0200, Mike Galbraith wrote:
> On Thu, 2015-05-28 at 15:53 +0200, Peter Zijlstra wrote:
> > On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> > > Hi Peter,
> > >
> > > I'm not seeing what prevents pull_task() from yanking a task out from
> > > under __sched_setscheduler().  A box sprinkling smoldering 3.0 kernel
> > > wreckage all over my bugzilla mbox isn't seeing it either ;-)
> >
> > Say, how easy can that thing be reproduced?
> >
> > The below is compile tested only, but it might just work if I didn't
> > miss anything :-)
>
> Seems trying to make the target invisible to balancing created a new
> race: dequeue target, do stuff that may drop rq->lock while it's
> dequeued, target sneaks into schedule(), dequeues itself (#2), boom.
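Roughly, the window being described looks like this (a hypothetical
sketch, not actual kernel code: change_policy() stands in for
rt_mutex_setprio()/__sched_setscheduler() and friends, and the callback
shown is just one example of "stuff that may drop rq->lock"):

static void change_policy(struct rq *rq, struct task_struct *p)
{
        int queued = task_on_rq_queued(p);

        if (queued)
                dequeue_task(rq, p, 0);         /* p now invisible to balancing */

        /*
         * Class callbacks (switched_from()/switched_to()/prio_changed())
         * may drop rq->lock in here.  If p is running on another CPU it
         * can slip into schedule() during that window, find itself already
         * off the runqueue, and dequeue a second time -- the "boom" above.
         */
        p->sched_class->switched_from(rq, p);

        if (queued)
                enqueue_task(rq, p, 0);         /* p visible again */
}
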
Well, the below (ick) plugged it up, but... I still don't see why we can't
just say no in can_migrate_task() if ->pi_lock is held.  That plugged the
original hole in a lot fewer lines (a rough sketch of that alternative
follows the patch).  Hohum, time to go pretend I have something better to
do on a sunny Sunday morning ;-)

 massive_intr_x-6213  [007] d...   170.579339: pull_rt_task: yup, pulled
 massive_intr_x-6213  [002] d...   170.580114: pull_rt_task: yup, pulled
 massive_intr_x-6213  [006] d...   170.586083: pull_rt_task: yup, pulled
          <...>-6237  [006] d...   170.593878: __schedule: saving the day

---
 kernel/sched/core.c     |   43 +++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |    6 +++---
 kernel/sched/rt.c       |   11 +++++++++--
 kernel/sched/sched.h    |   10 +++++++++-
 4 files changed, 56 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2745,9 +2745,18 @@ static void __sched __schedule(void)
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
+dequeued:
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 
+	if (unlikely(!task_on_rq_queued(prev))) {
+		trace_printk("saving the day\n");
+		tracing_off();
+		raw_spin_unlock_irq(&rq->lock);
+		cpu_relax();
+		goto dequeued;
+	}
+
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
@@ -3013,8 +3022,10 @@ void rt_mutex_setprio(struct task_struct
 	prev_class = p->sched_class;
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3067,8 +3078,10 @@ void rt_mutex_setprio(struct task_struct
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, enqueue_flag);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	/*
 	 * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
@@ -3114,8 +3127,10 @@ void set_user_nice(struct task_struct *p
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3125,6 +3140,7 @@ void set_user_nice(struct task_struct *p
 
 	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3628,8 +3644,10 @@ static int __sched_setscheduler(struct t
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3656,6 +3674,7 @@ static int __sched_setscheduler(struct t
 		 * increased (user space view).
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 	}
 
 	/*
@@ -4943,8 +4962,10 @@ void sched_setnuma(struct task_struct *p
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -4952,8 +4973,10 @@ void sched_setnuma(struct task_struct *p
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 	task_rq_unlock(rq, p, &flags);
 }
 #endif
@@ -7587,8 +7610,10 @@ void sched_move_task(struct task_struct
 	running = task_current(rq, tsk);
 	queued = task_on_rq_queued(tsk);
 
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7611,8 +7636,10 @@ void sched_move_task(struct task_struct
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	task_rq_unlock(rq, tsk, &flags);
 }
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -607,7 +607,7 @@ static enum hrtimer_restart dl_task_time
 	 * We can be both throttled and !queued. Replenish the counter
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
-	if (!task_on_rq_queued(p)) {
+	if (!task_on_rq_queued_or_dequeued(p)) {
 		replenish_dl_entity(dl_se, dl_se);
 		goto unlock;
 	}
@@ -1526,7 +1526,7 @@ static int pull_dl_task(struct rq *this_
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!task_on_rq_queued(p));
+			WARN_ON(!task_on_rq_queued_or_dequeued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1707,7 +1707,7 @@ static void switched_from_dl(struct rq *
 	 * this is the right place to try to pull some other one
 	 * from an overloaded cpu, if any.
 	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->dl.dl_nr_running)
 		return;
 
 	if (pull_dl_task(rq))
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1656,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(st
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task_on_rq_queued(task))) {
+				     !task_on_rq_queued_or_dequeued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1953,10 +1953,14 @@ static int pull_rt_task(struct rq *this_
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
 	struct task_struct *p;
 	struct rq *src_rq;
+	int task_dequeued = 0;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
+	if (this_rq->curr->on_rq == TASK_ON_RQ_DEQUEUED)
+		task_dequeued = 1;
+
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
 	 * see overloaded we must also see the rto_mask bit.
@@ -2035,6 +2039,9 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
+	if (ret && task_dequeued)
+		trace_printk("yup, pulled\n");
+
 	return ret;
 }
@@ -2133,7 +2140,7 @@ static void switched_from_rt(struct rq *
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -19,7 +19,8 @@ struct cpuidle_state;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
-#define TASK_ON_RQ_MIGRATING	2
+#define TASK_ON_RQ_DEQUEUED	2
+#define TASK_ON_RQ_MIGRATING	3
 
 extern __read_mostly int scheduler_running;
@@ -1034,6 +1035,13 @@ static inline int task_on_rq_queued(stru
 	return p->on_rq == TASK_ON_RQ_QUEUED;
 }
 
+static inline int task_on_rq_queued_or_dequeued(struct task_struct *p)
+{
+	if (p->on_rq == TASK_ON_RQ_QUEUED)
+		return 1;
+	return p->on_rq == TASK_ON_RQ_DEQUEUED;
+}
+
 static inline int task_on_rq_migrating(struct task_struct *p)
 {
 	return p->on_rq == TASK_ON_RQ_MIGRATING;
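
For reference, a very rough sketch of the can_migrate_task() alternative
mentioned above.  This is hypothetical and untested, and the body is
heavily abbreviated (the real can_migrate_task() in kernel/sched/fair.c
does its affinity, cfs-throttling and cache-hotness checks first); the
only new bit is the ->pi_lock test:

static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
        /* ... the existing affinity / throttling / cache-hotness checks ... */

        /*
         * Hypothetical extra check: if someone holds p->pi_lock they are
         * in the middle of rewriting p's scheduling state (setscheduler,
         * PI boosting, ...), so just say no and leave p where it is.
         */
        if (raw_spin_is_locked(&p->pi_lock))
                return 0;

        return 1;
}

Whether peeking at the lock like that is actually enough, or whether the
balancer would need to order against the ->pi_lock holder properly, is of
course the open question.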