On Tue, Nov 17, 2020 at 06:19:52PM -0500, Joel Fernandes (Google) wrote:
> In order to prevent interference and clearly support both per-task and CGroup
> APIs, split the cookie into 2 and allow it to be set from either per-task, or
> CGroup API. The final cookie is the combined value of both and is computed 
> when
> the stop-machine executes during a change of cookie.
> 
> Also, for the per-task cookie, it will get weird if we use pointers of any
> emphemeral objects. For this reason, introduce a refcounted object who's sole
> purpose is to assign unique cookie value by way of the object's pointer.
> 
> While at it, refactor the CGroup code a bit. Future patches will introduce 
> more
> APIs and support.
> 
> Reviewed-by: Josh Don <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
>  include/linux/sched.h |   2 +
>  kernel/sched/core.c   | 241 ++++++++++++++++++++++++++++++++++++++++--
>  kernel/sched/debug.c  |   4 +
>  3 files changed, 236 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a60868165590..c6a3b0fa952b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -688,6 +688,8 @@ struct task_struct {
>  #ifdef CONFIG_SCHED_CORE
>       struct rb_node                  core_node;
>       unsigned long                   core_cookie;
> +     unsigned long                   core_task_cookie;
> +     unsigned long                   core_group_cookie;
>       unsigned int                    core_occupation;
>  #endif
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b99a7493d590..7ccca355623a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -346,11 +346,14 @@ void sched_core_put(void)
>       mutex_unlock(&sched_core_mutex);
>  }
>  
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct 
> *t2);
> +
>  #else /* !CONFIG_SCHED_CORE */
>  
>  static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) 
> { }
>  static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) 
> { }
>  static bool sched_core_enqueued(struct task_struct *task) { return false; }
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct 
> *t2) { }
>  
>  #endif /* CONFIG_SCHED_CORE */
>  
> @@ -4032,6 +4035,20 @@ int sched_fork(unsigned long clone_flags, struct 
> task_struct *p)
>  #endif
>  #ifdef CONFIG_SCHED_CORE
>       RB_CLEAR_NODE(&p->core_node);
> +
> +     /*
> +      * Tag child via per-task cookie only if parent is tagged via per-task
> +      * cookie. This is independent of, but can be additive to the CGroup 
> tagging.
> +      */
> +     if (current->core_task_cookie) {
> +
> +             /* If it is not CLONE_THREAD fork, assign a unique per-task 
> tag. */
> +             if (!(clone_flags & CLONE_THREAD)) {
> +                     return sched_core_share_tasks(p, p);
> +               }
> +             /* Otherwise share the parent's per-task tag. */
> +             return sched_core_share_tasks(p, current);
> +     }
>  #endif
>       return 0;
>  }
> @@ -9731,6 +9748,217 @@ static u64 cpu_rt_period_read_uint(struct 
> cgroup_subsys_state *css,
>  #endif /* CONFIG_RT_GROUP_SCHED */
>  
>  #ifdef CONFIG_SCHED_CORE
> +/*
> + * A simple wrapper around refcount. An allocated sched_core_cookie's
> + * address is used to compute the cookie of the task.
> + */
> +struct sched_core_cookie {
> +     refcount_t refcnt;
> +};
> +
> +/*
> + * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
> + * @p: The task to assign a cookie to.
> + * @cookie: The cookie to assign.
> + * @group: is it a group interface or a per-task interface.
> + *
> + * This function is typically called from a stop-machine handler.

Can you clarify if it is typically or always, what are the implications for
locking?

> + */
> +void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, 
> bool group)
> +{
> +     if (!p)
> +             return;
> +
> +     if (group)
> +             p->core_group_cookie = cookie;
> +     else
> +             p->core_task_cookie = cookie;
> +
> +     /* Use up half of the cookie's bits for task cookie and remaining for 
> group cookie. */
> +     p->core_cookie = (p->core_task_cookie <<
> +                             (sizeof(unsigned long) * 4)) + 
> p->core_group_cookie;
> +

Always use masks to ensure this fits in the space we have, should we be 
concerned about
overflows and the potential for collision of cookie values?

> +     if (sched_core_enqueued(p)) {
> +             sched_core_dequeue(task_rq(p), p);
> +             if (!p->core_task_cookie)
> +                     return;
> +     }
> +
> +     if (sched_core_enabled(task_rq(p)) &&
> +                     p->core_cookie && task_on_rq_queued(p))
> +             sched_core_enqueue(task_rq(p), p);
> +}
> +
> +/* Per-task interface */
> +static unsigned long sched_core_alloc_task_cookie(void)
> +{
> +     struct sched_core_cookie *ptr =
> +             kmalloc(sizeof(struct sched_core_cookie), GFP_KERNEL);
> +
> +     if (!ptr)
> +             return 0;
> +     refcount_set(&ptr->refcnt, 1);
> +
> +     /*
> +      * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> +      * is done after the stopper runs.
> +      */
> +     sched_core_get();
> +     return (unsigned long)ptr;
> +}
> +
> +static bool sched_core_get_task_cookie(unsigned long cookie)
> +{
> +     struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> +     /*
> +      * NOTE: sched_core_put() is not done by put_task_cookie(). Instead, it
> +      * is done after the stopper runs.
> +      */
> +     sched_core_get();
> +     return refcount_inc_not_zero(&ptr->refcnt);
> +}
> +
> +static void sched_core_put_task_cookie(unsigned long cookie)
> +{
> +     struct sched_core_cookie *ptr = (struct sched_core_cookie *)cookie;
> +
> +     if (refcount_dec_and_test(&ptr->refcnt))
> +             kfree(ptr);
> +}
> +
> +struct sched_core_task_write_tag {
> +     struct task_struct *tasks[2];
> +     unsigned long cookies[2];
> +};

Use a better name instead of 2?

> +
> +/*
> + * Ensure that the task has been requeued. The stopper ensures that the task 
> cannot
> + * be migrated to a different CPU while its core scheduler queue state is 
> being updated.
> + * It also makes sure to requeue a task if it was running actively on 
> another CPU.
> + */
> +static int sched_core_task_join_stopper(void *data)
> +{
> +     struct sched_core_task_write_tag *tag = (struct 
> sched_core_task_write_tag *)data;
> +     int i;
> +
> +     for (i = 0; i < 2; i++)

Use ARRAY_SIZE(cookies) if you have to or ARRAY_SIZE(tasks)

> +             sched_core_tag_requeue(tag->tasks[i], tag->cookies[i], false /* 
> !group */);
> +
> +     return 0;
> +}
> +
> +static int sched_core_share_tasks(struct task_struct *t1, struct task_struct 
> *t2)
> +{

Can you please explain how t1 and t2 are related, there is a table below, but
I don't understand case#2, where the cookies get reset, is t2 the core leader
and t2 leads what t1 and t2 collectively get?

May be just called t2 as parent?

> +     struct sched_core_task_write_tag wr = {}; /* for stop machine. */
> +     bool sched_core_put_after_stopper = false;
> +     unsigned long cookie;
> +     int ret = -ENOMEM;
> +
> +     mutex_lock(&sched_core_mutex);
> +
> +     /*
> +      * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
> +      *       sched_core_put_task_cookie(). However, sched_core_put() is done
> +      *       by this function *after* the stopper removes the tasks from the
> +      *       core queue, and not before. This is just to play it safe.
> +      */
> +     if (t2 == NULL) {
> +             if (t1->core_task_cookie) {
> +                     sched_core_put_task_cookie(t1->core_task_cookie);
> +                     sched_core_put_after_stopper = true;
> +                     wr.tasks[0] = t1; /* Keep wr.cookies[0] reset for t1. */
> +             }
> +     } else if (t1 == t2) {
> +             /* Assign a unique per-task cookie solely for t1. */
> +
> +             cookie = sched_core_alloc_task_cookie();
> +             if (!cookie)
> +                     goto out_unlock;
> +
> +             if (t1->core_task_cookie) {
> +                     sched_core_put_task_cookie(t1->core_task_cookie);
> +                     sched_core_put_after_stopper = true;
> +             }
> +             wr.tasks[0] = t1;
> +             wr.cookies[0] = cookie;
> +     } else
> +     /*
> +      *              t1              joining         t2
> +      * CASE 1:
> +      * before       0                               0
> +      * after        new cookie                      new cookie
> +      *
> +      * CASE 2:
> +      * before       X (non-zero)                    0
> +      * after        0                               0
> +      *
> +      * CASE 3:
> +      * before       0                               X (non-zero)
> +      * after        X                               X
> +      *
> +      * CASE 4:
> +      * before       Y (non-zero)                    X (non-zero)
> +      * after        X                               X
> +      */
> +     if (!t1->core_task_cookie && !t2->core_task_cookie) {
> +             /* CASE 1. */
> +             cookie = sched_core_alloc_task_cookie();
> +             if (!cookie)
> +                     goto out_unlock;
> +
> +             /* Add another reference for the other task. */
> +             if (!sched_core_get_task_cookie(cookie)) {
> +                     ret = -EINVAL;
> +                     goto out_unlock;
> +             }
> +
> +             wr.tasks[0] = t1;
> +             wr.tasks[1] = t2;
> +             wr.cookies[0] = wr.cookies[1] = cookie;
> +
> +     } else if (t1->core_task_cookie && !t2->core_task_cookie) {
> +             /* CASE 2. */
> +             sched_core_put_task_cookie(t1->core_task_cookie);
> +             sched_core_put_after_stopper = true;
> +
> +             wr.tasks[0] = t1; /* Reset cookie for t1. */
> +
> +     } else if (!t1->core_task_cookie && t2->core_task_cookie) {
> +             /* CASE 3. */
> +             if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> +                     ret = -EINVAL;
> +                     goto out_unlock;
> +             }
> +
> +             wr.tasks[0] = t1;
> +             wr.cookies[0] = t2->core_task_cookie;
> +
> +     } else {
> +             /* CASE 4. */
> +             if (!sched_core_get_task_cookie(t2->core_task_cookie)) {
> +                     ret = -EINVAL;
> +                     goto out_unlock;
> +             }
> +             sched_core_put_task_cookie(t1->core_task_cookie);
> +             sched_core_put_after_stopper = true;
> +
> +             wr.tasks[0] = t1;
> +             wr.cookies[0] = t2->core_task_cookie;
> +     }
> +
> +     stop_machine(sched_core_task_join_stopper, (void *)&wr, NULL);
> +
> +     if (sched_core_put_after_stopper)
> +             sched_core_put();
> +
> +     ret = 0;
> +out_unlock:
> +     mutex_unlock(&sched_core_mutex);
> +     return ret;
> +}
> +
> +/* CGroup interface */
>  static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct 
> cftype *cft)
>  {
>       struct task_group *tg = css_tg(css);
> @@ -9761,18 +9989,9 @@ static int __sched_write_tag(void *data)
>        * when we set cgroup tag to 0 when the loop is done below.
>        */
>       while ((p = css_task_iter_next(&it))) {
> -             p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> -
> -             if (sched_core_enqueued(p)) {
> -                     sched_core_dequeue(task_rq(p), p);
> -                     if (!p->core_cookie)
> -                             continue;
> -             }
> -
> -             if (sched_core_enabled(task_rq(p)) &&
> -                 p->core_cookie && task_on_rq_queued(p))
> -                     sched_core_enqueue(task_rq(p), p);
> +             unsigned long cookie = !!val ? (unsigned long)tg : 0UL;
>  
> +             sched_core_tag_requeue(p, cookie, true /* group */);
>       }
>       css_task_iter_end(&it);
>  
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 60a922d3f46f..8c452b8010ad 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1024,6 +1024,10 @@ void proc_sched_show_task(struct task_struct *p, 
> struct pid_namespace *ns,
>               __PS("clock-delta", t1-t0);
>       }
>  
> +#ifdef CONFIG_SCHED_CORE
> +     __PS("core_cookie", p->core_cookie);
> +#endif
> +
>       sched_show_numa(p, m);
>  }
>

Balbir Singh.  

Reply via email to