On Sat, 2015-05-30 at 15:08 +0200, Mike Galbraith wrote:
> On Thu, 2015-05-28 at 15:53 +0200, Peter Zijlstra wrote:
> > On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> > > Hi Peter,
> > >
> > > I'm not seeing what prevents pull_task() from yanking a task out from
> > > under __sched_setscheduler().  A box sprinkling smoldering 3.0 kernel
> > > wreckage all over my bugzilla mbox isn't seeing it either ;-)
> >
> > Say, how easy can that thing be reproduced?
> >
> > The below is compile tested only, but it might just work if I didn't
> > miss anything :-)
>
> Seems trying to make the target invisible to balancing created a new
> race: dequeue target, do stuff that may drop rq->lock while it's
> dequeued, target sneaks into schedule(), dequeues itself (#2), boom.
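Roughly, the window being described looks like this (a hypothetical
sketch, not actual kernel code: change_policy() stands in for
rt_mutex_setprio()/__sched_setscheduler() and friends, and the callback
shown is just one example of "stuff that may drop rq->lock"):

static void change_policy(struct rq *rq, struct task_struct *p)
{
        int queued = task_on_rq_queued(p);

        if (queued)
                dequeue_task(rq, p, 0);         /* p now invisible to balancing */

        /*
         * Class callbacks (switched_from()/switched_to()/prio_changed())
         * may drop rq->lock in here.  If p is running on another CPU it
         * can slip into schedule() during that window, find itself already
         * off the runqueue, and dequeue a second time -- the "boom" above.
         */
        p->sched_class->switched_from(rq, p);

        if (queued)
                enqueue_task(rq, p, 0);         /* p visible again */
}
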
Well, the below (ick) plugged it up, but... I still don't see why we can't
just say no in can_migrate_task() if ->pi_lock is held.  That plugged the
original hole in a lot fewer lines (a rough sketch of that alternative
follows the patch).  Hohum, time to go pretend I have something better to
do on a sunny Sunday morning ;-)

 massive_intr_x-6213  [007] d...   170.579339: pull_rt_task: yup, pulled
 massive_intr_x-6213  [002] d...   170.580114: pull_rt_task: yup, pulled
 massive_intr_x-6213  [006] d...   170.586083: pull_rt_task: yup, pulled
          <...>-6237  [006] d...   170.593878: __schedule: saving the day

---
 kernel/sched/core.c     |   43 +++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |    6 +++---
 kernel/sched/rt.c       |   11 +++++++++--
 kernel/sched/sched.h    |   10 +++++++++-
 4 files changed, 56 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2745,9 +2745,18 @@ static void __sched __schedule(void)
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
+dequeued:
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 
+	if (unlikely(!task_on_rq_queued(prev))) {
+		trace_printk("saving the day\n");
+		tracing_off();
+		raw_spin_unlock_irq(&rq->lock);
+		cpu_relax();
+		goto dequeued;
+	}
+
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
@@ -3013,8 +3022,10 @@ void rt_mutex_setprio(struct task_struct
 	prev_class = p->sched_class;
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3067,8 +3078,10 @@ void rt_mutex_setprio(struct task_struct
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, enqueue_flag);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	/*
 	 * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
@@ -3114,8 +3127,10 @@ void set_user_nice(struct task_struct *p
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3125,6 +3140,7 @@ void set_user_nice(struct task_struct *p
 
 	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3628,8 +3644,10 @@ static int __sched_setscheduler(struct t
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3656,6 +3674,7 @@ static int __sched_setscheduler(struct t
 		 * increased (user space view).
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 	}
 
 	/*
@@ -4943,8 +4962,10 @@ void sched_setnuma(struct task_struct *p
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -4952,8 +4973,10 @@ void sched_setnuma(struct task_struct *p
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 	task_rq_unlock(rq, p, &flags);
 }
 #endif
@@ -7587,8 +7610,10 @@ void sched_move_task(struct task_struct
 	running = task_current(rq, tsk);
 	queued = task_on_rq_queued(tsk);
 
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7611,8 +7636,10 @@ void sched_move_task(struct task_struct
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	task_rq_unlock(rq, tsk, &flags);
 }
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -607,7 +607,7 @@ static enum hrtimer_restart dl_task_time
 	 * We can be both throttled and !queued. Replenish the counter
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
-	if (!task_on_rq_queued(p)) {
+	if (!task_on_rq_queued_or_dequeued(p)) {
 		replenish_dl_entity(dl_se, dl_se);
 		goto unlock;
 	}
@@ -1526,7 +1526,7 @@ static int pull_dl_task(struct rq *this_
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!task_on_rq_queued(p));
+			WARN_ON(!task_on_rq_queued_or_dequeued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1707,7 +1707,7 @@ static void switched_from_dl(struct rq *
 	 * this is the right place to try to pull some other one
 	 * from an overloaded cpu, if any.
 	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->dl.dl_nr_running)
 		return;
 
 	if (pull_dl_task(rq))
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1656,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(st
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task_on_rq_queued(task))) {
+				     !task_on_rq_queued_or_dequeued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1953,10 +1953,14 @@ static int pull_rt_task(struct rq *this_
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
 	struct task_struct *p;
 	struct rq *src_rq;
+	int task_dequeued = 0;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
+	if (this_rq->curr->on_rq == TASK_ON_RQ_DEQUEUED)
+		task_dequeued = 1;
+
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
 	 * see overloaded we must also see the rto_mask bit.
@@ -2035,6 +2039,9 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
+	if (ret && task_dequeued)
+		trace_printk("yup, pulled\n");
+
 	return ret;
 }
@@ -2133,7 +2140,7 @@ static void switched_from_rt(struct rq *
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -19,7 +19,8 @@ struct cpuidle_state;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
-#define TASK_ON_RQ_MIGRATING	2
+#define TASK_ON_RQ_DEQUEUED	2
+#define TASK_ON_RQ_MIGRATING	3
 
 extern __read_mostly int scheduler_running;
@@ -1034,6 +1035,13 @@ static inline int task_on_rq_queued(stru
 	return p->on_rq == TASK_ON_RQ_QUEUED;
 }
 
+static inline int task_on_rq_queued_or_dequeued(struct task_struct *p)
+{
+	if (p->on_rq == TASK_ON_RQ_QUEUED)
+		return 1;
+	return p->on_rq == TASK_ON_RQ_DEQUEUED;
+}
+
 static inline int task_on_rq_migrating(struct task_struct *p)
 {
 	return p->on_rq == TASK_ON_RQ_MIGRATING;
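
For reference, a very rough sketch of the can_migrate_task() alternative
mentioned above.  This is hypothetical and untested, and the body is
heavily abbreviated (the real can_migrate_task() in kernel/sched/fair.c
does its affinity, cfs-throttling and cache-hotness checks first); the
only new bit is the ->pi_lock test:

static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
        /* ... the existing affinity / throttling / cache-hotness checks ... */

        /*
         * Hypothetical extra check: if someone holds p->pi_lock they are
         * in the middle of rewriting p's scheduling state (setscheduler,
         * PI boosting, ...), so just say no and leave p where it is.
         */
        if (raw_spin_is_locked(&p->pi_lock))
                return 0;

        return 1;
}

Whether peeking at the lock like that is actually enough, or whether the
balancer would need to order against the ->pi_lock holder properly, is of
course the open question.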