The only case where the cgroup_taskset structure requires task migration to multiple cpusets is when enabling a cpuset controller in cgroup v2, where the newly created child cpusets inherit the same effective CPUs and memory nodes from the parent. In that case, task migration can happen directly with no update to the tasks' CPU and memory node assignments, and no further work is needed from the cpuset side except updating nr_deadline_tasks when DL tasks are involved and setting old_mems_allowed in the child cpusets.
Do that by tracking all the destination cpusets with a new dst_cs_head singly linked list. The reset_migrate_dl_data() function is integrated into clear_attach_data() so that it can be used for both source and destination cpusets. It is assumed that a given cpuset cannot be both a source and a destination cpuset. If such a condition happens, or when there are multiple destination cpusets with CPU or memory node changes, the current code will not handle it correctly. So print a warning and fail the attach operation in these unexpected cases; the code will have to be enhanced to support them if such use cases turn out to be valid and not coding errors. Signed-off-by: Waiman Long <[email protected]> --- kernel/cgroup/cpuset-internal.h | 1 + kernel/cgroup/cpuset.c | 115 ++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 6636cf5ce326..8cbfacc5f315 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -149,6 +149,7 @@ struct cpuset { * For linking impacted cpusets during an attach operation. 
*/ struct llist_node attach_node; + bool attach_source; /* partition root state */ int partition_root_state; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e9e97c6765f0..09b3ad52c639 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -366,10 +366,12 @@ static struct { bool cpus_updated; bool mems_updated; bool task_work_queued; + bool many_dest_cs; /* Have many destination cpusets */ struct cpuset *old_cs; /* Source cpuset */ nodemask_t nodemask_to; } attach_ctx; static LLIST_HEAD(src_cs_head); +static LLIST_HEAD(dst_cs_head); static inline void check_insane_mems_config(nodemask_t *nodes) { @@ -3025,8 +3027,23 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs, if (!oldcs) return 0; - if (!llist_on_list(&oldcs->attach_node)) + /* + * The same cpuset cannot be both a source and a destination. + * The current code does not support that, print a warning and + * fail the attach if so. + */ + if (WARN_ON_ONCE((!oldcs->attach_source && + llist_on_list(&oldcs->attach_node)) || + cs->attach_source)) + return -EINVAL; + + if (!llist_on_list(&oldcs->attach_node)) { llist_add(&oldcs->attach_node, &src_cs_head); + oldcs->attach_source = true; + } + + if (!llist_on_list(&cs->attach_node)) + llist_add(&cs->attach_node, &dst_cs_head); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); @@ -3056,35 +3073,31 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs, return 0; } -static int cpuset_reserve_dl_bw(struct cpuset *cs) +static int cpuset_reserve_dl_bw(void) { + struct cpuset *cs; int cpu, ret; - if (!cs->sum_migrate_dl_bw) - return 0; + llist_for_each_entry(cs, dst_cs_head.first, attach_node) { + if (!cs->sum_migrate_dl_bw) + continue; - cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - if (unlikely(cpu >= nr_cpu_ids)) - return -EINVAL; + cpu = cpumask_any_and(cpu_active_mask, 
cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) - return ret; + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + return ret; - cs->dl_bw_cpu = cpu; + cs->dl_bw_cpu = cpu; + } return 0; } -static void reset_migrate_dl_data(struct cpuset *cs) -{ - cs->nr_migrate_dl_tasks = 0; - cs->sum_migrate_dl_bw = 0; - cs->dl_bw_cpu = -1; -} - /* * Clear and optionally apply (@cancel is false) the attach related data in the - * source cpusets. + * source or destination cpuset. */ static void clear_attach_data(struct llist_head *head, bool cancel) { @@ -3096,8 +3109,13 @@ static void clear_attach_data(struct llist_head *head, bool cancel) if (cs->nr_migrate_dl_tasks) { if (!cancel) cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + else if (cs->dl_bw_cpu >= 0) /* && cancel */ + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); cs->nr_migrate_dl_tasks = 0; + cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } + cs->attach_source = false; } } @@ -3118,6 +3136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); attach_ctx.cpus_updated = false; attach_ctx.mems_updated = false; + attach_ctx.many_dest_cs = false; /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs, oldcs, &setsched_check); @@ -3142,9 +3161,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * selected as attach_ctx.old_cs. 
*/ cgroup_taskset_for_each(task, css, tset) { + struct cpuset *new_cs = css_cs(css); struct cpuset *new_oldcs = task_cs(task); - if (new_oldcs != oldcs) { + if ((new_oldcs != oldcs) || (new_cs != cs)) { + if (new_cs != cs) + attach_ctx.many_dest_cs = true; + cs = new_cs; oldcs = new_oldcs; ret = cpuset_can_attach_check(cs, oldcs, &setsched_check); if (ret) @@ -3178,12 +3201,28 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } } - ret = cpuset_reserve_dl_bw(cs); + /* + * The only case where there are multiple destination cpusets for + * task migration is when enabling a v2 cpuset controller, where + * tasks will be migrated to multiple child cpusets from a parent + * cpuset with the same effective CPUs and memory nodes. IOW, + * both attach_ctx.cpus_updated and attach_ctx.mems_updated should be false. + * If not, it is a condition that the current code cannot handle. + * Print a warning and abort the attach operation as further code + * changes will be needed. + */ + if (WARN_ON_ONCE(attach_ctx.many_dest_cs && (!cpuset_v2() || + attach_ctx.cpus_updated || attach_ctx.mems_updated))) { + ret = -EINVAL; + goto out_unlock; + } + + ret = cpuset_reserve_dl_bw(); out_unlock: if (ret) { - reset_migrate_dl_data(cs); clear_attach_data(&src_cs_head, true); + clear_attach_data(&dst_cs_head, true); } else { attach_ctx.in_progress++; } @@ -3194,22 +3233,10 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) static void cpuset_cancel_attach(struct cgroup_taskset *tset) { - struct cgroup_subsys_state *css; - struct cpuset *cs; - - cgroup_taskset_first(tset, &css); - cs = css_cs(css); - mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(); clear_attach_data(&src_cs_head, true); - - if (cs->dl_bw_cpu >= 0) - dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - - if (cs->nr_migrate_dl_tasks) - reset_migrate_dl_data(cs); - + clear_attach_data(&dst_cs_head, true); mutex_unlock(&cpuset_mutex); } @@ -3293,25 +3320,25 @@ static void cpuset_attach(struct cgroup_taskset 
*tset) * In the default hierarchy, enabling cpuset in the child cgroups * will trigger a cpuset_attach() call with no change in effective cpus * and mems. In that case, we can optimize out by skipping the task - * iteration and update. + * iteration and update, but the destination cpuset list is iterated to + * set old_mems_allowed. */ - if (cpuset_v2() && !attach_ctx.cpus_updated && !attach_ctx.mems_updated) + if (cpuset_v2() && !attach_ctx.cpus_updated && !attach_ctx.mems_updated) { + llist_for_each_entry(cs, dst_cs_head.first, attach_node) + cs->old_mems_allowed = attach_ctx.nodemask_to; goto out; + } + /* Task iteration shouldn't happen with attach_ctx.many_dest_cs set */ cgroup_taskset_for_each(task, css, tset) cpuset_attach_task(cs, task); -out: if (attach_ctx.task_work_queued) schedule_flush_migrate_mm(); cs->old_mems_allowed = attach_ctx.nodemask_to; - - if (cs->nr_migrate_dl_tasks) { - cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; - reset_migrate_dl_data(cs); - } - +out: clear_attach_data(&src_cs_head, false); + clear_attach_data(&dst_cs_head, false); dec_attach_in_progress_locked(); mutex_unlock(&cpuset_mutex); -- 2.54.0

