Implement a basic cgroup core-scheduling interface.

A new cpu.core_sched file is added which takes the values 0,1. When
set, the cgroup and all its descendants will be granted the same
cookie and thus allowed to share a core with each other, but not with
system tasks or tasks of other subtrees that might have another
cookie.

The file is hierarchical, and a subtree can again set it to 1, in
which case that subtree will get a different cookie and will no longer
share with the parent tree.

For each task, the nearest core_sched parent 'wins'.

Interaction with the prctl() interface is non-existent and left for a
future patch.

Notably; this patch somewhat abuses cgroup_mutex. By holding
cgroup_mutex over the write() operation, which sets the cookie, the
cookie is stable in any cgroup callback (that is called with
cgroup_mutex held). A future patch relies on ss->can_attach() and
ss->attach() being 'atomic', which is hard to do without cgroup_mutex.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 kernel/sched/core.c  |  150 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    7 ++
 2 files changed, 157 insertions(+)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5688,10 +5688,53 @@ static inline void sched_core_cpu_starti
                }
        }
 }
+
+void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg)
+{
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (parent->core_parent) {
+               WARN_ON_ONCE(parent->core_cookie);
+               WARN_ON_ONCE(!parent->core_parent->core_cookie);
+               tg->core_parent = parent->core_parent;
+
+       } else if (parent->core_cookie) {
+               WARN_ON_ONCE(parent->core_parent);
+               tg->core_parent = parent;
+       }
+}
+
+void sched_core_cgroup_free(struct task_group *tg)
+{
+       sched_core_put_cookie(tg->core_cookie);
+}
+
+unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+       unsigned long cookie = 0;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (tg->core_cookie)
+               cookie = tg->core_cookie;
+       else if (tg->core_parent)
+               cookie = tg->core_parent->core_cookie;
+
+       return sched_core_get_cookie(cookie);
+}
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline void sched_core_cpu_starting(unsigned int cpu) {}
 
+static inline void sched_core_cgroup_free(struct task_group *tg) { }
+static inline void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg) { }
+
+static inline unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+       return 0;
+}
+
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -9310,6 +9353,7 @@ static void sched_free_group(struct task
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
        autogroup_free(tg);
+       sched_core_cgroup_free(tg);
        kmem_cache_free(task_group_cache, tg);
 }
 
@@ -9353,6 +9397,8 @@ void sched_online_group(struct task_grou
        spin_unlock_irqrestore(&task_group_lock, flags);
 
        online_fair_sched_group(tg);
+
+       sched_core_cgroup_online(parent, tg);
 }
 
 /* rcu callback to free various structures associated with a task group */
@@ -9414,6 +9460,7 @@ void sched_move_task(struct task_struct
 {
        int queued, running, queue_flags =
                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+       unsigned long cookie;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -9443,6 +9490,10 @@ void sched_move_task(struct task_struct
        }
 
        task_rq_unlock(rq, tsk, &rf);
+
+       cookie = sched_core_cgroup_cookie(tsk->sched_task_group);
+       cookie = sched_core_update_cookie(tsk, cookie);
+       sched_core_put_cookie(cookie);
 }
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10050,6 +10101,89 @@ static u64 cpu_rt_period_read_uint(struc
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_CORE
+u64 cpu_sched_core_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       return !!css_tg(css)->core_cookie;
+}
+
+int cpu_sched_core_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+       unsigned long cookie = 0, old_cookie = 0;
+       struct task_group *tg = css_tg(css);
+       struct cgroup_subsys_state *cssi;
+       struct task_group *parent = NULL;
+       int ret = 0;
+
+       if (val > 1)
+               return -ERANGE;
+
+       if (!static_branch_likely(&sched_smt_present))
+               return -ENODEV;
+
+       mutex_lock(&cgroup_mutex);
+       if (!!val == !!tg->core_cookie)
+               goto unlock;
+
+       old_cookie = tg->core_cookie;
+       if (val) {
+               cookie = sched_core_alloc_cookie();
+               if (!cookie) {
+                       ret = -ENOMEM;
+                       goto unlock;
+               }
+               WARN_ON_ONCE(old_cookie);
+
+       } else if (tg->parent) {
+               if (tg->parent->core_parent)
+                       parent = tg->parent->core_parent;
+               else if (tg->parent->core_cookie)
+                       parent = tg->parent;
+       }
+
+       WARN_ON_ONCE(cookie && parent);
+
+       tg->core_cookie = sched_core_get_cookie(cookie);
+       tg->core_parent = parent;
+
+       if (cookie)
+               parent = tg;
+       else if (parent)
+               cookie = sched_core_get_cookie(parent->core_cookie);
+
+       css_for_each_descendant_pre(cssi, css) {
+               struct task_group *tgi = css_tg(cssi);
+               struct css_task_iter it;
+               struct task_struct *p;
+
+               if (tgi != tg) {
+               if (tgi->core_cookie || (tgi->core_parent && tgi->core_parent != tg))
+                               continue;
+
+                       tgi->core_parent = parent;
+                       tgi->core_cookie = 0;
+               }
+
+               css_task_iter_start(cssi, 0, &it);
+               while ((p = css_task_iter_next(&it))) {
+                       unsigned long p_cookie;
+
+                       cookie = sched_core_get_cookie(cookie);
+                       p_cookie = sched_core_update_cookie(p, cookie);
+                       sched_core_put_cookie(p_cookie);
+               }
+               css_task_iter_end(&it);
+       }
+
+unlock:
+       mutex_unlock(&cgroup_mutex);
+
+       sched_core_put_cookie(cookie);
+       sched_core_put_cookie(old_cookie);
+       return ret;
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
        {
@@ -10100,6 +10234,14 @@ static struct cftype cpu_legacy_files[]
                .write = cpu_uclamp_max_write,
        },
 #endif
+#ifdef CONFIG_SCHED_CORE
+       {
+               .name = "core_sched",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_u64 = cpu_sched_core_read_u64,
+               .write_u64 = cpu_sched_core_write_u64,
+       },
+#endif
        { }     /* Terminate */
 };
 
@@ -10281,6 +10423,14 @@ static struct cftype cpu_files[] = {
                .write = cpu_uclamp_max_write,
        },
 #endif
+#ifdef CONFIG_SCHED_CORE
+       {
+               .name = "core_sched",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_u64 = cpu_sched_core_read_u64,
+               .write_u64 = cpu_sched_core_write_u64,
+       },
+#endif
        { }     /* terminate */
 };
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -431,6 +431,10 @@ struct task_group {
        struct uclamp_se        uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+       struct task_group       *core_parent;
+       unsigned long           core_cookie;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1130,6 +1134,9 @@ static inline bool is_migration_disabled
 
 struct sched_group;
 #ifdef CONFIG_SCHED_CORE
+
+extern struct mutex cgroup_mutex; // XXX
+
 DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
 static inline struct cpumask *sched_group_span(struct sched_group *sg);
 


Reply via email to