Implement a basic cgroup core-scheduling interface. A new cpu.core_sched file is added which takes the values 0,1. When set, the cgroup and all its descendants will be granted the same cookie and thus allowed to share a core with each other, but not with system tasks or tasks of other subtrees that might have another cookie.
The file is hierarchical, and a subtree can again set it to 1, in which case that subtree will get a different cookie and will no longer share with the parent tree. For each task, the nearest core_sched parent 'wins'. Interaction with the prctl() interface is non-existent and left for a future patch. Noteably; this patch somewhat abuses cgroup_mutex. By holding cgroup_mutex over the write() operation, which sets the cookie, the cookie is stable in any cgroup callback (that is called with cgroup_mutex held). A future patch relies on ss->can_attach() and ss->attach() being 'atomic', which is hard to do without cgroup_mutex. Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> --- kernel/sched/core.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 7 ++ 2 files changed, 157 insertions(+) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5688,10 +5688,53 @@ static inline void sched_core_cpu_starti } } } + +void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg) +{ + lockdep_assert_held(&cgroup_mutex); + + if (parent->core_parent) { + WARN_ON_ONCE(parent->core_cookie); + WARN_ON_ONCE(!parent->core_parent->core_cookie); + tg->core_parent = parent->core_parent; + + } else if (parent->core_cookie) { + WARN_ON_ONCE(parent->core_parent); + tg->core_parent = parent; + } +} + +void sched_core_cgroup_free(struct task_group *tg) +{ + sched_core_put_cookie(tg->core_cookie); +} + +unsigned long sched_core_cgroup_cookie(struct task_group *tg) +{ + unsigned long cookie = 0; + + lockdep_assert_held(&cgroup_mutex); + + if (tg->core_cookie) + cookie = tg->core_cookie; + else if (tg->core_parent) + cookie = tg->core_parent->core_cookie; + + return sched_core_get_cookie(cookie); +} + #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cgroup_free(struct task_group *tg) { } +static inline void sched_core_cgroup_online(struct task_group *parent, 
struct task_group *tg) { } + +static inline unsigned long sched_core_cgroup_cookie(struct task_group *tg) +{ + return 0; +} + static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -9310,6 +9353,7 @@ static void sched_free_group(struct task free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); + sched_core_cgroup_free(tg); kmem_cache_free(task_group_cache, tg); } @@ -9353,6 +9397,8 @@ void sched_online_group(struct task_grou spin_unlock_irqrestore(&task_group_lock, flags); online_fair_sched_group(tg); + + sched_core_cgroup_online(parent, tg); } /* rcu callback to free various structures associated with a task group */ @@ -9414,6 +9460,7 @@ void sched_move_task(struct task_struct { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned long cookie; struct rq_flags rf; struct rq *rq; @@ -9443,6 +9490,10 @@ void sched_move_task(struct task_struct } task_rq_unlock(rq, tsk, &rf); + + cookie = sched_core_cgroup_cookie(tsk->sched_task_group); + cookie = sched_core_update_cookie(tsk, cookie); + sched_core_put_cookie(cookie); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -10050,6 +10101,89 @@ static u64 cpu_rt_period_read_uint(struc } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SCHED_CORE +u64 cpu_sched_core_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return !!css_tg(css)->core_cookie; +} + +int cpu_sched_core_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) +{ + unsigned long cookie = 0, old_cookie = 0; + struct task_group *tg = css_tg(css); + struct cgroup_subsys_state *cssi; + struct task_group *parent = NULL; + int ret = 0; + + if (val > 1) + return -ERANGE; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + mutex_lock(&cgroup_mutex); + if (!!val == !!tg->core_cookie) + goto unlock; + + old_cookie = tg->core_cookie; + if (val) { + cookie = 
sched_core_alloc_cookie(); + if (!cookie) { + ret = -ENOMEM; + goto unlock; + } + WARN_ON_ONCE(old_cookie); + + } else if (tg->parent) { + if (tg->parent->core_parent) + parent = tg->parent->core_parent; + else if (tg->parent->core_cookie) + parent = tg->parent; + } + + WARN_ON_ONCE(cookie && parent); + + tg->core_cookie = sched_core_get_cookie(cookie); + tg->core_parent = parent; + + if (cookie) + parent = tg; + else if (parent) + cookie = sched_core_get_cookie(parent->core_cookie); + + css_for_each_descendant_pre(cssi, css) { + struct task_group *tgi = css_tg(cssi); + struct css_task_iter it; + struct task_struct *p; + + if (tgi != tg) { + if (tgi->core_cookie || (tgi->core_parent && tgi->core_parent != tg)) + continue; + + tgi->core_parent = parent; + tgi->core_cookie = 0; + } + + css_task_iter_start(cssi, 0, &it); + while ((p = css_task_iter_next(&it))) { + unsigned long p_cookie; + + cookie = sched_core_get_cookie(cookie); + p_cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(p_cookie); + } + css_task_iter_end(&it); + } + +unlock: + mutex_unlock(&cgroup_mutex); + + sched_core_put_cookie(cookie); + sched_core_put_cookie(old_cookie); + return ret; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10100,6 +10234,14 @@ static struct cftype cpu_legacy_files[] .write = cpu_uclamp_max_write, }, #endif +#ifdef CONFIG_SCHED_CORE + { + .name = "core_sched", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_sched_core_read_u64, + .write_u64 = cpu_sched_core_write_u64, + }, +#endif { } /* Terminate */ }; @@ -10281,6 +10423,14 @@ static struct cftype cpu_files[] = { .write = cpu_uclamp_max_write, }, #endif +#ifdef CONFIG_SCHED_CORE + { + .name = "core_sched", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_sched_core_read_u64, + .write_u64 = cpu_sched_core_write_u64, + }, +#endif { } /* terminate */ }; --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -431,6 +431,10 @@ struct task_group { struct 
uclamp_se uclamp[UCLAMP_CNT]; #endif +#ifdef CONFIG_SCHED_CORE + struct task_group *core_parent; + unsigned long core_cookie; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1130,6 +1134,9 @@ static inline bool is_migration_disabled struct sched_group; #ifdef CONFIG_SCHED_CORE + +extern struct mutex cgroup_mutex; // XXX + DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); static inline struct cpumask *sched_group_span(struct sched_group *sg);