The cpuset_attach_task() was introduced in commit 42a11bf5c543
("cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly")
to enable the CLONE_INTO_CGROUP flag of clone(2) to behave more like
moving a task from one cpuset into another one. That commit didn't
move the mpol_rebind_mm() and cpuset_migrate_mm() calls for the group
leader into cpuset_attach_task().

When the CLONE_INTO_CGROUP flag is used without CLONE_THREAD, the new
task is its own group leader. So it is still not equivalent to moving
a task between cpusets in this case. Make CLONE_INTO_CGROUP behave
more closely to cpuset_attach() by moving the mpol_rebind_mm() and
cpuset_migrate_mm() calls inside cpuset_attach_task().

Also move the stack local cpus_updated, mems_updated and queue_task_work
flags into attach_ctx (the last one renamed to task_work_queued) so that
these flags can be accessed inside and outside of cpuset_attach_task().
The cpuset_fork() function is updated to set up these flags and do
memory migration if necessary.

Signed-off-by: Waiman Long <[email protected]>
---
 kernel/cgroup/cpuset.c | 104 ++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 44 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e830ba13be9b..ef14ee821b4b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -362,6 +362,9 @@ static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  */
 static struct {
        int in_progress;
+       bool cpus_updated;
+       bool mems_updated;
+       bool task_work_queued;
        struct cpuset *old_cs;  /* Source cpuset */
        nodemask_t nodemask_to;
 } attach_ctx;
@@ -3171,6 +3174,8 @@ static cpumask_var_t cpus_attach;
 
 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 {
+       struct mm_struct *mm;
+
        lockdep_assert_cpuset_lock_held();
 
        if (cs != &top_cpuset)
@@ -3184,28 +3189,60 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
         */
        WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 
+       if (cpuset_v2() && !attach_ctx.mems_updated)
+               return;
+
        cpuset_change_task_nodemask(task, &attach_ctx.nodemask_to);
        cpuset1_update_task_spread_flags(cs, task);
+
+       if ((task != task->group_leader) ||
+           (!is_memory_migrate(cs) && !attach_ctx.mems_updated))
+               return;
+
+       /*
+        * Change mm for threadgroup leader. This is expensive and may
+        * sleep and should be moved outside migration path proper.
+        */
+       mm = get_task_mm(task);
+       if (mm) {
+               struct cpuset *oldcs = attach_ctx.old_cs;
+
+               mpol_rebind_mm(mm, &cs->effective_mems);
+
+               /*
+                * old_mems_allowed is the same with mems_allowed
+                * here, except if this task is being moved
+                * automatically due to hotplug.  In that case
+                * @mems_allowed has been updated and is empty, so
+                * @old_mems_allowed is the right nodesets that we
+                * migrate mm from.
+                */
+               if (is_memory_migrate(cs)) {
+                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+                                         &attach_ctx.nodemask_to);
+                       attach_ctx.task_work_queued = true;
+               } else {
+                       mmput(mm);
+               }
+       }
 }
 
 static void cpuset_attach(struct cgroup_taskset *tset)
 {
        struct task_struct *task;
-       struct task_struct *leader;
        struct cgroup_subsys_state *css;
        struct cpuset *cs;
        struct cpuset *oldcs = attach_ctx.old_cs;
-       bool cpus_updated, mems_updated;
-       bool queue_task_work = false;
 
        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);
 
        lockdep_assert_cpus_held();     /* see cgroup_attach_lock() */
        mutex_lock(&cpuset_mutex);
-       cpus_updated = !cpumask_equal(cs->effective_cpus,
-                                     oldcs->effective_cpus);
-       mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+       attach_ctx.task_work_queued = false;
+
+       attach_ctx.cpus_updated = !cpumask_equal(cs->effective_cpus, 
oldcs->effective_cpus);
+       attach_ctx.mems_updated = !nodes_equal(cs->effective_mems, 
oldcs->effective_mems);
        guarantee_online_mems(cs, &attach_ctx.nodemask_to);
 
        /*
@@ -3214,46 +3251,14 @@ static void cpuset_attach(struct cgroup_taskset *tset)
         * and mems. In that case, we can optimize out by skipping the task
         * iteration and update.
         */
-       if (cpuset_v2() && !cpus_updated && !mems_updated)
+       if (cpuset_v2() && !attach_ctx.cpus_updated && !attach_ctx.mems_updated)
                goto out;
 
        cgroup_taskset_for_each(task, css, tset)
                cpuset_attach_task(cs, task);
 
-       /*
-        * Change mm for all threadgroup leaders. This is expensive and may
-        * sleep and should be moved outside migration path proper. Skip it
-        * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
-        * not set.
-        */
-       if (!is_memory_migrate(cs) && !mems_updated)
-               goto out;
-
-       cgroup_taskset_for_each_leader(leader, css, tset) {
-               struct mm_struct *mm = get_task_mm(leader);
-
-               if (mm) {
-                       mpol_rebind_mm(mm, &cs->effective_mems);
-
-                       /*
-                        * old_mems_allowed is the same with mems_allowed
-                        * here, except if this task is being moved
-                        * automatically due to hotplug.  In that case
-                        * @mems_allowed has been updated and is empty, so
-                        * @old_mems_allowed is the right nodesets that we
-                        * migrate mm from.
-                        */
-                       if (is_memory_migrate(cs)) {
-                               cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
-                                                 &attach_ctx.nodemask_to);
-                               queue_task_work = true;
-                       } else
-                               mmput(mm);
-               }
-       }
-
 out:
-       if (queue_task_work)
+       if (attach_ctx.task_work_queued)
                schedule_flush_migrate_mm();
        cs->old_mems_allowed = attach_ctx.nodemask_to;
 
@@ -3700,15 +3705,14 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
  */
 static void cpuset_fork(struct task_struct *task)
 {
-       struct cpuset *cs;
-       bool same_cs;
+       struct cpuset *cs, *oldcs;
 
        rcu_read_lock();
        cs = task_cs(task);
-       same_cs = (cs == task_cs(current));
+       oldcs = task_cs(current);
        rcu_read_unlock();
 
-       if (same_cs) {
+       if (cs == oldcs) {
                if (cs == &top_cpuset)
                        return;
 
@@ -3720,7 +3724,19 @@ static void cpuset_fork(struct task_struct *task)
        /* CLONE_INTO_CGROUP */
        mutex_lock(&cpuset_mutex);
        guarantee_online_mems(cs, &attach_ctx.nodemask_to);
+       cs->old_mems_allowed = attach_ctx.nodemask_to;
+
+       /*
+        * Assume CPUs and memory nodes are updated
+        * A CLONE_INTO_CGROUP operation should have taken the cgroup mutex
+        * and so there shouldn't be a competing cpuset_attach() operation.
+        */
+       attach_ctx.cpus_updated = attach_ctx.mems_updated = true;
+       attach_ctx.task_work_queued = false;
+       attach_ctx.old_cs = oldcs;
        cpuset_attach_task(cs, task);
+       if (attach_ctx.task_work_queued)
+               schedule_flush_migrate_mm();
 
        dec_attach_in_progress_locked();
        mutex_unlock(&cpuset_mutex);
-- 
2.54.0


Reply via email to