fair: Fix attaching task sched avgs twice when switching to fair or changing task group

Vincent Guittot Thu, 16 Jun 2016 09:31:12 -0700

Le Wednesday 15 Jun 2016 à 17:22:17 (+0200), Peter Zijlstra a écrit :
> On Wed, Jun 15, 2016 at 09:46:53AM +0200, Vincent Guittot wrote:
> > I still have concerned with this change of the behavior that  attaches
> > the task only when it is enqueued. The load avg of the task will not
> > be decayed between the time we move it into its new group until its
> > enqueue. With this change, a task's load can stay high whereas it has
> > slept for the last couple of seconds. Then, its load and utilization
> > is no more accounted anywhere in the mean time just because we have
> > moved the task which will be enqueued on the same rq.
> > A task should always be attached to a cfs_rq and its load/utilization
> > should always be accounted on a cfs_rq and decayed for its sleep
> > period
> 
> OK; so I think I agree with that. Does the below (completely untested,
> hasn't even been near a compiler) look reasonable?
> 
> The general idea is to always attach to a cfs_rq -- through
> post_init_entity_util_avg(). This covers both the new task isn't
> attached yet and was never in the fair class to begin with issues.


Your patch ensures that a task will be attached to a cfs_rq and fix the issue 
raised by Yuyang because of se->avg.last_update_time = 0 at init. During the 
test the following message has raised  "BUG: using smp_processor_id() in 
preemptible [00000000] code: systemd/1" because of cfs_rq_util_change that is 
called in attach_entity_load_avg

With patch [1] for the init of cfs_rq side, all use cases will be covered 
regarding the issue linked to a last_update_time set to 0 at init
[1] https://lkml.org/lkml/2016/5/30/508

> 
> That only leaves a tiny hole in fork() where the task is hashed but
> hasn't yet passed through wake_up_new_task() in which someone can do
> cgroup move on it. That is closed with TASK_NEW and can_attach()
> refusing those tasks.
> 

But a new fair task is still detached and attached from/to task_group with 
cgroup_post_fork()-->ss->fork(child)-->cpu_cgroup_fork()-->sched_move_task()-->task_move_group_fair().
cpu_cgroup_can_attach is not used in this path and sched_move_task do the move 
unconditionally for fair task.

With your patch, we still have the sequence

sched_fork()
    set_task_cpu()
cgroup_post_fork()--> ... --> task_move_group_fair()
    detach_task_cfs_rq()
    set_task_rq()
    attach_task_cfs_rq()
wake_up_new_task()
    select_task_rq() can select a new cpu
    set_task_cpu()
        migrate_task_rq_fair if the new_cpu != task_cpu
             remove_load()
        __set_task_cpu
    post_init_entity_util_avg
        attach_task_cfs_rq()
    activate_task
        enqueue_task

In fact, cpu_cgroup_fork needs a small part of sched_move_task so we can just 
call this small part directly instead sched_move_task. And the task doesn't 
really migrate because it is not yet attached so we need the sequence:
sched_fork()
    __set_task_cpu()
cgroup_post_fork()--> ... --> task_move_group_fair()
    set_task_rq() to set task group and runqueue
wake_up_new_task()
    select_task_rq() can select a new cpu
    __set_task_cpu
    post_init_entity_util_avg
        attach_task_cfs_rq()
    activate_task
        enqueue_task

The patch below on top of your patch, ensures that we follow the right sequence 
:

---
 kernel/sched/core.c | 60 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7895689a..a21e3dc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2373,7 +2373,7 @@ int sched_fork(unsigned long clone_flags, struct 
task_struct *p)
         * Silence PROVE_RCU.
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
+       __set_task_cpu(p, cpu);
        if (p->sched_class->task_fork)
                p->sched_class->task_fork(p);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2515,7 +2515,7 @@ void wake_up_new_task(struct task_struct *p)
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         */
-       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
        /* Post initialize new task's util average when its cfs_rq is set */
        post_init_entity_util_avg(&p->se);
@@ -7715,6 +7715,35 @@ void sched_offline_group(struct task_group *tg)
        spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
+/* Set task's runqueue and group
+ *     In case of a move between group, we update src and dst group
+ *     thanks to sched_class->task_move_group. Otherwise, we just need to set
+ *     runqueue and group pointers. The task will be attached to the runqueue
+ *     during its wake up.
+ */
+static void sched_set_group(struct task_struct *tsk, bool move)
+{
+       struct task_group *tg;
+
+       /*
+        * All callers are synchronized by task_rq_lock(); we do not use RCU
+        * which is pointless here. Thus, we pass "true" to task_css_check()
+        * to prevent lockdep warnings.
+        */
+       tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
+                         struct task_group, css);
+       tg = autogroup_task_group(tsk, tg);
+       tsk->sched_task_group = tg;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       if (move && tsk->sched_class->task_move_group)
+               tsk->sched_class->task_move_group(tsk);
+       else
+#endif
+               set_task_rq(tsk, task_cpu(tsk));
+
+}
+
 /* change task's runqueue when it moves between groups.
  *     The caller of this function should have put the task in its new group
  *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
@@ -7722,7 +7751,6 @@ void sched_offline_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
-       struct task_group *tg;
        int queued, running;
        struct rq_flags rf;
        struct rq *rq;
@@ -7737,22 +7765,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
-       /*
-        * All callers are synchronized by task_rq_lock(); we do not use RCU
-        * which is pointless here. Thus, we pass "true" to task_css_check()
-        * to prevent lockdep warnings.
-        */
-       tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
-                         struct task_group, css);
-       tg = autogroup_task_group(tsk, tg);
-       tsk->sched_task_group = tg;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk);
-       else
-#endif
-               set_task_rq(tsk, task_cpu(tsk));
+       sched_set_group(tsk, true);
 
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
@@ -8182,7 +8195,14 @@ static void cpu_cgroup_css_free(struct 
cgroup_subsys_state *css)
 
 static void cpu_cgroup_fork(struct task_struct *task)
 {
-       sched_move_task(task);
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(task, &rf);
+
+       sched_set_group(task, false);
+
+       task_rq_unlock(rq, task, &rf);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
-- 
1.9.1

Re: [PATCH v6 1/4] sched/fair: Fix attaching task sched avgs twice when switching to fair or changing task group

Reply via email to