All the nasty bits that manage prctl() and cgroup core-sched
interaction.

In order to manage this, a 'fat' cookie is introduced; this is
basically a nested cookie that links to two other cookies. Any unique
combination of task and cgroup cookies will get _one_ fat cookie.

Uniqueness of fat cookies is ensured by use of a global tree.

Due to the locking rules for cookies, the need for fat cookies is not
apparent up-front, nor can they be allocated in-situ; therefore,
pre-allocate them aggressively and mostly free them instantly when not
used.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 include/linux/sched.h     |    1 
 kernel/sched/core.c       |   25 +++
 kernel/sched/core_sched.c |  329 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h      |   11 +
 4 files changed, 337 insertions(+), 29 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -704,6 +704,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
        struct rb_node                  core_node;
        unsigned long                   core_cookie;
+       void                            *core_spare_fat;
        unsigned int                    core_occupation;
 #endif
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9487,7 +9487,7 @@ void sched_move_task(struct task_struct
        task_rq_unlock(rq, tsk, &rf);
 
        cookie = sched_core_cgroup_cookie(tsk->sched_task_group);
-       cookie = sched_core_update_cookie(tsk, cookie);
+       cookie = sched_core_update_cookie(tsk, cookie | GROUP_COOKIE);
        sched_core_put_cookie(cookie);
 }
 
@@ -9592,6 +9592,10 @@ static int cpu_cgroup_can_attach(struct
 
                if (ret)
                        break;
+
+               ret = sched_core_prealloc_fat(task);
+               if (ret)
+                       break;
        }
        return ret;
 }
@@ -10122,13 +10126,28 @@ int cpu_sched_core_write_u64(struct cgro
 
        old_cookie = tg->core_cookie;
        if (val) {
-               cookie = sched_core_alloc_cookie();
+               cookie = sched_core_alloc_cookie(GROUP_COOKIE);
                if (!cookie) {
                        ret = -ENOMEM;
                        goto unlock;
                }
                WARN_ON_ONCE(old_cookie);
 
+               css_for_each_descendant_pre(cssi, css) {
+                       struct css_task_iter it;
+                       struct task_struct *p;
+
+                       css_task_iter_start(cssi, 0, &it);
+                       while ((p = css_task_iter_next(&it))) {
+                               ret = sched_core_prealloc_fat(p);
+                               if (ret) {
+                                       css_task_iter_end(&it);
+                                       goto unlock;
+                               }
+                       }
+                       css_task_iter_end(&it);
+               }
+
        } else if (tg->parent) {
                if (tg->parent->core_parent)
                        parent = tg->parent->core_parent;
@@ -10164,7 +10183,7 @@ int cpu_sched_core_write_u64(struct cgro
                        unsigned long p_cookie;
 
                        cookie = sched_core_get_cookie(cookie);
-                       p_cookie = sched_core_update_cookie(p, cookie);
+                       p_cookie = sched_core_update_cookie(p, cookie | GROUP_COOKIE);
                        sched_core_put_cookie(p_cookie);
                }
                css_task_iter_end(&it);
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/prctl.h>
+#include <linux/rbtree.h>
+#include <linux/cgroup.h>
 #include "sched.h"
 
 /*
@@ -8,26 +10,243 @@
  * address is used to compute the cookie of the task.
  */
 struct sched_core_cookie {
-       refcount_t refcnt;
+       refcount_t      refcnt;
+       unsigned int    type;
 };
 
-unsigned long sched_core_alloc_cookie(void)
+static inline void *cookie_ptr(unsigned long cookie)
+{
+       return (void *)(cookie & ~3UL);
+}
+
+static inline int cookie_type(unsigned long cookie)
+{
+       return cookie & 3;
+}
+
+static inline void sched_core_init_cookie(struct sched_core_cookie *ck, unsigned int type)
+{
+       refcount_set(&ck->refcnt, 1);
+       ck->type = type;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#define FAT_COOKIE     0x03
+
+struct sched_core_fat_cookie {
+       struct sched_core_cookie        cookie;
+       unsigned long                   task_cookie;
+       unsigned long                   group_cookie;
+       struct rb_node                  node;
+};
+
+static DEFINE_RAW_SPINLOCK(fat_lock);
+static struct rb_root fat_root;
+
+static void fat_mutex_lock(void)
+{
+       /*
+        * { ss->can_attach(), ss->attach() } vs prctl() for p->core_spare_fat
+        */
+       mutex_lock(&cgroup_mutex);
+}
+
+static void fat_mutex_unlock(void)
+{
+       mutex_unlock(&cgroup_mutex);
+}
+
+static void sched_core_put_fat(struct sched_core_fat_cookie *fat)
+{
+       unsigned long flags;
+
+       if (fat->cookie.type != FAT_COOKIE)
+               return;
+
+       sched_core_put_cookie(fat->task_cookie);
+       sched_core_put_cookie(fat->group_cookie);
+
+       if (!RB_EMPTY_NODE(&fat->node)) {
+               raw_spin_lock_irqsave(&fat_lock, flags);
+               rb_erase(&fat->node, &fat_root);
+               raw_spin_unlock_irqrestore(&fat_lock, flags);
+       }
+}
+
+static void *node_2_fat(struct rb_node *n)
+{
+       return rb_entry(n, struct sched_core_fat_cookie, node);
+}
+
+static int fat_cmp(struct rb_node *a, struct rb_node *b)
+{
+       struct sched_core_fat_cookie *ca = node_2_fat(a);
+       struct sched_core_fat_cookie *cb = node_2_fat(b);
+
+       if (ca->group_cookie < cb->group_cookie)
+               return -1;
+       if (ca->group_cookie > cb->group_cookie)
+               return 1;
+
+       if (ca->task_cookie < cb->task_cookie)
+               return -1;
+       if (ca->task_cookie > cb->task_cookie)
+               return 1;
+
+       if (refcount_inc_not_zero(&cb->cookie.refcnt))
+               return 0;
+
+       return 1;
+}
+
+static unsigned long __sched_core_fat_cookie(struct task_struct *p,
+                                            void **spare_fat,
+                                            unsigned long cookie)
+{
+       unsigned long task_cookie, group_cookie;
+       unsigned int p_type = cookie_type(p->core_cookie);
+       unsigned int c_type = cookie_type(cookie);
+       struct sched_core_fat_cookie *fat;
+       unsigned long flags;
+       struct rb_node *n;
+
+       if (WARN_ON_ONCE(c_type == FAT_COOKIE))
+               return cookie;
+
+       if (!p_type || p_type == c_type)
+               return cookie;
+
+       if (p_type == FAT_COOKIE) {
+               fat = cookie_ptr(p->core_cookie);
+
+               /* loose fat */
+               if (!cookie_ptr(cookie)) {
+                       if (c_type == TASK_COOKIE)
+                               cookie = fat->group_cookie;
+                       else
+                               cookie = fat->task_cookie;
+
+                       WARN_ON_ONCE(!cookie_ptr(cookie));
+                       return sched_core_get_cookie(cookie);
+               }
+
+               /* other fat */
+               if (c_type == TASK_COOKIE)
+                       group_cookie = fat->group_cookie;
+               else
+                       task_cookie = fat->task_cookie;
+
+       } else {
+
+               /* new fat */
+               if (p_type == TASK_COOKIE)
+                       task_cookie = p->core_cookie;
+               else
+                       group_cookie = p->core_cookie;
+       }
+
+       if (c_type == TASK_COOKIE)
+               task_cookie = cookie;
+       else
+               group_cookie = cookie;
+
+       fat = *spare_fat;
+       if (WARN_ON_ONCE(!fat))
+               return cookie;
+
+       sched_core_init_cookie(&fat->cookie, FAT_COOKIE);
+       fat->task_cookie = sched_core_get_cookie(task_cookie);
+       fat->group_cookie = sched_core_get_cookie(group_cookie);
+       RB_CLEAR_NODE(&fat->node);
+
+       raw_spin_lock_irqsave(&fat_lock, flags);
+       n = rb_find_add(&fat->node, &fat_root, fat_cmp);
+       raw_spin_unlock_irqrestore(&fat_lock, flags);
+
+       if (n) {
+               sched_core_put_fat(fat);
+               fat = node_2_fat(n);
+       } else {
+               *spare_fat = NULL;
+       }
+
+       return (unsigned long)fat | FAT_COOKIE;
+}
+
+static int __sched_core_alloc_fat(void **spare_fat)
+{
+       if (*spare_fat)
+               return 0;
+
+       *spare_fat = kmalloc(sizeof(struct sched_core_fat_cookie), GFP_KERNEL);
+       if (!*spare_fat)
+               return -ENOMEM;
+
+       return 0;
+}
+
+int sched_core_prealloc_fat(struct task_struct *p)
+{
+       lockdep_assert_held(&cgroup_mutex);
+       return __sched_core_alloc_fat(&p->core_spare_fat);
+}
+
+static inline unsigned long __sched_core_task_cookie(struct task_struct *p)
+{
+       unsigned long cookie = p->core_cookie;
+       unsigned int c_type = cookie_type(cookie);
+
+       if (!(c_type & TASK_COOKIE))
+               return 0;
+
+       if (c_type == FAT_COOKIE)
+               cookie = ((struct sched_core_fat_cookie *)cookie_ptr(cookie))->task_cookie;
+
+       return cookie;
+}
+
+#else
+
+static inline void fat_mutex_lock(void) { }
+static inline void fat_mutex_unlock(void) { }
+
+static inline void sched_core_put_fat(void *ptr) { }
+static inline int __sched_core_alloc_fat(void **spare_fat) { return 0; }
+
+static inline unsigned long __sched_core_fat_cookie(struct task_struct *p,
+                                                   void **spare_fat,
+                                                   unsigned long cookie)
+{
+       return cookie;
+}
+
+static inline unsigned long __sched_core_task_cookie(struct task_struct *p)
+{
+       return p->core_cookie;
+}
+
+#endif /* CGROUP_SCHED */
+
+unsigned long sched_core_alloc_cookie(unsigned int type)
 {
        struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
        if (!ck)
                return 0;
 
-       refcount_set(&ck->refcnt, 1);
+       WARN_ON_ONCE(type > GROUP_COOKIE);
+       sched_core_init_cookie(ck, type);
        sched_core_get();
 
-       return (unsigned long)ck;
+       return (unsigned long)ck | type;
 }
 
 void sched_core_put_cookie(unsigned long cookie)
 {
-       struct sched_core_cookie *ptr = (void *)cookie;
+       struct sched_core_cookie *ptr = cookie_ptr(cookie);
 
        if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+               sched_core_put_fat((void *)ptr);
                kfree(ptr);
                sched_core_put();
        }
@@ -35,7 +254,7 @@ void sched_core_put_cookie(unsigned long
 
 unsigned long sched_core_get_cookie(unsigned long cookie)
 {
-       struct sched_core_cookie *ptr = (void *)cookie;
+       struct sched_core_cookie *ptr = cookie_ptr(cookie);
 
        if (ptr)
                refcount_inc(&ptr->refcnt);
@@ -50,14 +269,22 @@ unsigned long sched_core_get_cookie(unsi
  * @cookie: The new cookie.
  * @cookie_type: The cookie field to which the cookie corresponds.
  */
-unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+static unsigned long __sched_core_update_cookie(struct task_struct *p,
+                                               void **spare_fat,
+                                               unsigned long cookie)
 {
        unsigned long old_cookie;
        struct rq_flags rf;
        struct rq *rq;
        bool enqueued;
 
-       rq = task_rq_lock(p, &rf);
+       raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+       cookie = __sched_core_fat_cookie(p, spare_fat, cookie);
+       if (!cookie_ptr(cookie))
+               cookie = 0UL;
+
+       rq = __task_rq_lock(p, &rf);
 
        /*
         * Since creating a cookie implies sched_core_get(), and we cannot set
@@ -90,9 +317,19 @@ unsigned long sched_core_update_cookie(s
        return old_cookie;
 }
 
+unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+{
+       cookie = __sched_core_update_cookie(p, &p->core_spare_fat, cookie);
+       if (p->core_spare_fat) {
+               kfree(p->core_spare_fat);
+               p->core_spare_fat = NULL;
+       }
+       return cookie;
+}
+
 static unsigned long sched_core_clone_cookie(struct task_struct *p)
 {
-       unsigned long cookie, flags;
+       unsigned long flags, cookie;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        cookie = sched_core_get_cookie(p->core_cookie);
@@ -101,26 +338,47 @@ static unsigned long sched_core_clone_co
        return cookie;
 }
 
+static unsigned long sched_core_clone_task_cookie(struct task_struct *p)
+{
+       unsigned long flags, cookie;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       cookie = sched_core_get_cookie(__sched_core_task_cookie(p));
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+       return cookie;
+}
+
 void sched_core_fork(struct task_struct *p)
 {
        RB_CLEAR_NODE(&p->core_node);
        p->core_cookie = sched_core_clone_cookie(current);
+       p->core_spare_fat = NULL;
 }
 
 void sched_core_free(struct task_struct *p)
 {
        sched_core_put_cookie(p->core_cookie);
+       kfree(p->core_spare_fat);
 }
 
 int sched_core_exec(void)
 {
        /* absent a policy mech, if task had a cookie, give it a new one */
-       if (current->core_cookie) {
-               unsigned long cookie = sched_core_alloc_cookie();
+       if (current->core_cookie & TASK_COOKIE) {
+               void *spare_fat = NULL;
+               unsigned long cookie;
+
+               if (__sched_core_alloc_fat(&spare_fat))
+                       return -ENOMEM;
+
+               cookie = sched_core_alloc_cookie(TASK_COOKIE);
                if (!cookie)
                        return -ENOMEM;
-               cookie = sched_core_update_cookie(current, cookie);
+
+       cookie = __sched_core_update_cookie(current, &spare_fat, cookie);
                sched_core_put_cookie(cookie);
+               kfree(spare_fat);
        }
 
        return 0;
@@ -129,7 +387,7 @@ int sched_core_exec(void)
 static void __sched_core_set(struct task_struct *p, unsigned long cookie)
 {
        cookie = sched_core_get_cookie(cookie);
-       cookie = sched_core_update_cookie(p, cookie);
+       cookie = sched_core_update_cookie(p, cookie | TASK_COOKIE);
        sched_core_put_cookie(cookie);
 }
 
@@ -171,55 +429,62 @@ int sched_core_share_pid(unsigned int cm
                goto out;
        }
 
+       fat_mutex_lock();
+
+       err = sched_core_prealloc_fat(task);
+       if (err)
+               goto out_unlock;
+
        switch (cmd) {
        case PR_SCHED_CORE_GET:
                if (type != PIDTYPE_PID || uaddr & 7) {
                        err = -EINVAL;
-                       goto out;
+                       goto out_unlock;
                }
-               cookie = sched_core_clone_cookie(task);
-               if (cookie) {
+               cookie = sched_core_clone_task_cookie(task);
+               if (cookie_ptr(cookie)) {
                        /* XXX improve ? */
                        ptr_to_hashval((void *)cookie, &id);
                }
                err = put_user(id, (u64 __user *)uaddr);
-               goto out;
+               goto out_unlock;
 
        case PR_SCHED_CORE_CLEAR:
                cookie = 0;
                break;
 
        case PR_SCHED_CORE_CREATE:
-               cookie = sched_core_alloc_cookie();
+               cookie = sched_core_alloc_cookie(TASK_COOKIE);
                if (!cookie) {
                        err = -ENOMEM;
-                       goto out;
+                       goto out_unlock;
                }
                break;
 
        case PR_SCHED_CORE_SHARE_TO:
-               cookie = sched_core_clone_cookie(current);
+               cookie = sched_core_clone_task_cookie(current);
                break;
 
        case PR_SCHED_CORE_SHARE_FROM:
                if (type != PIDTYPE_PID) {
                        err = -EINVAL;
-                       goto out;
+                       goto out_unlock;
                }
-               cookie = sched_core_clone_cookie(task);
+               cookie = sched_core_clone_task_cookie(task);
                __sched_core_set(current, cookie);
-               goto out;
+               goto out_unlock;
 
        default:
                err = -EINVAL;
-               goto out;
+               goto out_unlock;
        };
 
        if (type == PIDTYPE_PID) {
                __sched_core_set(task, cookie);
-               goto out;
+               goto out_unlock;
        }
 
+again:
        read_lock(&tasklist_lock);
        grp = task_pid_type(task, type);
 
@@ -228,6 +493,18 @@ int sched_core_share_pid(unsigned int cm
                        err = -EPERM;
                        goto out_tasklist;
                }
+
+               if (IS_ENABLED(CONFIG_CGROUP_SCHED) && !p->core_spare_fat) {
+                       get_task_struct(p);
+                       read_unlock(&tasklist_lock);
+
+                       err = sched_core_prealloc_fat(p);
+                       put_task_struct(p);
+                       if (err)
+                               goto out_unlock;
+
+                       goto again;
+               }
        } while_each_pid_thread(grp, type, p);
 
        do_each_pid_thread(grp, type, p) {
@@ -236,6 +513,8 @@ int sched_core_share_pid(unsigned int cm
 out_tasklist:
        read_unlock(&tasklist_lock);
 
+out_unlock:
+       fat_mutex_unlock();
 out:
        sched_core_put_cookie(cookie);
        put_task_struct(task);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1240,10 +1240,14 @@ extern void sched_core_dequeue(struct rq
 extern void sched_core_get(void);
 extern void sched_core_put(void);
 
-extern unsigned long sched_core_alloc_cookie(void);
+#define TASK_COOKIE    0x01
+#define GROUP_COOKIE   0x02
+
+extern unsigned long sched_core_alloc_cookie(unsigned int type);
 extern void sched_core_put_cookie(unsigned long cookie);
 extern unsigned long sched_core_get_cookie(unsigned long cookie);
 extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+extern int sched_core_prealloc_fat(struct task_struct *p);
 
 #else /* !CONFIG_SCHED_CORE */
 
@@ -1257,6 +1261,11 @@ static inline bool sched_core_disabled(v
        return true;
 }
 
+static inline int sched_core_prealloc_fat(struct task_struct *p)
+{
+       return 0;
+}
+
 static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 {
        return &rq->__lock;


Reply via email to