Hello folks,

I found a problem in the rt migration code while I was working on a
latency-sensitive system.

I think we need to consider not only a task's priority within its
sched_class but also the priorities between sched_classes on
migration, at least for rt (or deadline) tasks.
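
The idea boils down to a small helper that walks the sched_classes
above the task's own class and lets each higher class drop the CPUs it
currently occupies from the candidate mask. A simplified sketch of the
hook the patch below adds in core.c:

    int exclude_higher_class(struct cpumask *dstp, struct task_struct *p)
    {
            const struct sched_class *class;
            struct rq *rq = task_rq(p);

            /* for_each_class() walks from the highest class downwards. */
            for_each_class(class) {
                    if (class == p->sched_class)
                            break;
                    /* Let the higher class mask out the CPUs it occupies. */
                    if (class->exclude_higher_class)
                            class->exclude_higher_class(rq, dstp);
            }

            /* Non-zero iff any candidate CPU is left in the mask. */
            return !cpumask_empty(dstp);
    }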

Even though I worked against a rather old stable tree (v4.4), I will
rebase onto the latest mainline tree and re-submit it as a normal
patch (not RFC) if you agree with the approach.

BTW, am I missing something here?

Thanks,
Byungchul

----->8-----
From 785a569d705d8f743fc8bd58b810222cbb5f6eb1 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.p...@lge.com>
Date: Thu, 23 Jul 2020 18:42:05 +0900
Subject: [RFC] sched: Consider higher or lower sched_class tasks on
 migration

The scheduler should avoid migrating rt (or deadline) tasks to CPUs
that are running tasks of a higher sched_class, but currently it does
not.

In addition, when a task of a higher sched_class is picked via
->pick_next_task(), it should be taken into account that tasks of a
lower sched_class than the picked task may need to be moved to other
CPUs.

For example, when a dl task has just been picked, the scheduler should
push rt tasks to other CPUs if the preempted task is in the rt
sched_class and its priority is high enough to run on another CPU.
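
In code terms this amounts to something like the following in
pick_next_task_dl() (see the corresponding hunk below), where the rt
push callback is queued as well whenever the task being preempted is
an rt task:

    if (prev->sched_class == &rt_sched_class)
            queue_push_rt_tasks(rq);

    queue_push_dl_tasks(rq);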

While at it, rename queue_{push,pull}_* functions to reflect their
classes.

Signed-off-by: Byungchul Park <byungchul.p...@lge.com>
---
 kernel/sched/core.c        | 15 +++++++++++++++
 kernel/sched/cpudeadline.c | 21 +++++++++++++++++++--
 kernel/sched/cpudeadline.h |  1 +
 kernel/sched/cpupri.c      |  2 ++
 kernel/sched/deadline.c    | 31 ++++++++++++++++++++-----------
 kernel/sched/rt.c          | 16 ++++++----------
 kernel/sched/sched.h       |  8 ++++++++
 kernel/sched/stop_task.c   | 28 ++++++++++++++++++++++++++++
 8 files changed, 99 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d99240f..a391eb2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1157,6 +1157,21 @@ static int migration_cpu_stop(void *data)
        return 0;
 }
 
+int exclude_higher_class(struct cpumask *dstp, struct task_struct *p)
+{
+       const struct sched_class *cur = p->sched_class;
+       const struct sched_class *class;
+       struct rq *rq = task_rq(p);
+
+       for_each_class(class) {
+               if (class == cur)
+                       break;
+               if (class->exclude_higher_class)
+                       class->exclude_higher_class(rq, dstp);
+       }
+       return !cpumask_empty(dstp);
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5a75b08..19887ff 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include "cpudeadline.h"
+#include "sched.h"
 
 static inline int parent(int i)
 {
@@ -103,14 +104,25 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
        const struct sched_dl_entity *dl_se = &p->dl;
 
        if (later_mask &&
-           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) &&
+           exclude_higher_class(later_mask, p)) {
                best_cpu = cpumask_any(later_mask);
                goto out;
        } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
                        dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
                best_cpu = cpudl_maximum(cp);
-               if (later_mask)
+               if (later_mask) {
                        cpumask_set_cpu(best_cpu, later_mask);
+
+                       /*
+                        * TODO: It would be better to pick the second best
+                        * in this case.
+                        */
+                       if (!exclude_higher_class(later_mask, p)) {
+                               cpumask_clear_cpu(best_cpu, later_mask);
+                               best_cpu = -1;
+                       }
+               }
        }
 
 out:
@@ -201,6 +213,11 @@ void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
        cpumask_clear_cpu(cpu, cp->free_cpus);
 }
 
+void cpudl_exclude_dlcpus(struct cpudl *cp, struct cpumask *dstp)
+{
+       cpumask_and(dstp, dstp, cp->free_cpus);
+}
+
 /*
  * cpudl_init - initialize the cpudl structure
  * @cp: the cpudl max-heap context
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83..bc2510c 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -27,6 +27,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
 int cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
+void cpudl_exclude_dlcpus(struct cpudl *cp, struct cpumask *dstp);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7..a509a96 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -32,6 +32,7 @@
 #include <linux/sched/rt.h>
 #include <linux/slab.h>
 #include "cpupri.h"
+#include "sched.h"
 
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
 static int convert_prio(int prio)
@@ -108,6 +109,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 
                if (lowest_mask) {
                        cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+                       exclude_higher_class(lowest_mask, p);
 
                        /*
                         * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e12b0a4..5b43690 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -219,7 +219,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 static void push_dl_tasks(struct rq *);
 static void pull_dl_task(struct rq *);
 
-static inline void queue_push_tasks(struct rq *rq)
+void queue_push_dl_tasks(struct rq *rq)
 {
        if (!has_pushable_dl_tasks(rq))
                return;
@@ -227,7 +227,7 @@ static inline void queue_push_tasks(struct rq *rq)
        queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
 }
 
-static inline void queue_pull_task(struct rq *rq)
+static inline void queue_pull_dl_task(struct rq *rq)
 {
        queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
@@ -314,11 +314,7 @@ static inline void pull_dl_task(struct rq *rq)
 {
 }
 
-static inline void queue_push_tasks(struct rq *rq)
-{
-}
-
-static inline void queue_pull_task(struct rq *rq)
+static inline void queue_pull_dl_task(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1329,7 +1325,14 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
        if (hrtick_enabled(rq))
                start_hrtick_dl(rq, p);
 
-       queue_push_tasks(rq);
+       /*
+        * The callback queued earlier will be executed later. Let the
+        * dl migration be performed first in balance_callback().
+        */
+       if (prev->sched_class == &rt_sched_class)
+               queue_push_rt_tasks(rq);
+
+       queue_push_dl_tasks(rq);
 
        return p;
 }
@@ -1800,6 +1803,11 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
        }
 }
 
+static void exclude_higher_class_dl(struct rq *rq, struct cpumask *dstp)
+{
+       cpudl_exclude_dlcpus(&rq->rd->cpudl, dstp);
+}
+
 static void set_cpus_allowed_dl(struct task_struct *p,
                                const struct cpumask *new_mask)
 {
@@ -1884,7 +1892,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
        if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
                return;
 
-       queue_pull_task(rq);
+       queue_pull_dl_task(rq);
 }
 
 /*
@@ -1896,7 +1904,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
                if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
-                       queue_push_tasks(rq);
+                       queue_push_dl_tasks(rq);
 #endif
                if (dl_task(rq->curr))
                        check_preempt_curr_dl(rq, p, 0);
@@ -1921,7 +1929,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
                 * or lowering its prio, so...
                 */
                if (!rq->dl.overloaded)
-                       queue_pull_task(rq);
+                       queue_pull_dl_task(rq);
 
                /*
                 * If we now have a earlier deadline task than p,
@@ -1959,6 +1967,7 @@ const struct sched_class dl_sched_class = {
        .rq_online              = rq_online_dl,
        .rq_offline             = rq_offline_dl,
        .task_woken             = task_woken_dl,
+       .exclude_higher_class   = exclude_higher_class_dl,
 #endif
 
        .set_curr_task          = set_curr_task_dl,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 801b4ec..b789d4f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -349,7 +349,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 static void push_rt_tasks(struct rq *);
 static void pull_rt_task(struct rq *);
 
-static inline void queue_push_tasks(struct rq *rq)
+void queue_push_rt_tasks(struct rq *rq)
 {
        if (!has_pushable_tasks(rq))
                return;
@@ -357,7 +357,7 @@ static inline void queue_push_tasks(struct rq *rq)
        queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 }
 
-static inline void queue_pull_task(struct rq *rq)
+static inline void queue_pull_rt_task(struct rq *rq)
 {
        queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
@@ -414,10 +414,6 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 static inline void pull_rt_task(struct rq *this_rq)
 {
 }
-
-static inline void queue_push_tasks(struct rq *rq)
-{
-}
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
@@ -1495,7 +1491,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
 
-       queue_push_tasks(rq);
+       queue_push_rt_tasks(rq);
 
        return p;
 }
@@ -2113,7 +2109,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
        if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
                return;
 
-       queue_pull_task(rq);
+       queue_pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2144,7 +2140,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
                if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
-                       queue_push_tasks(rq);
+                       queue_push_rt_tasks(rq);
 #endif /* CONFIG_SMP */
                if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
                        resched_curr(rq);
@@ -2168,7 +2164,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                 * may need to pull tasks to this runqueue.
                 */
                if (oldprio < p->prio)
-                       queue_pull_task(rq);
+                       queue_pull_rt_task(rq);
 
                /*
                 * If there's a higher priority task waiting to run
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0c9ebd8..d164f54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -789,6 +789,9 @@ queue_balance_callback(struct rq *rq,
        rq->balance_callback = head;
 }
 
+extern void queue_push_rt_tasks(struct rq *rq);
+extern void queue_push_dl_tasks(struct rq *rq);
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -910,6 +913,9 @@ extern int group_balance_cpu(struct sched_group *sg);
 
 #else
 
+static inline void queue_push_rt_tasks(struct rq *rq) { }
+static inline void queue_push_dl_tasks(struct rq *rq) { }
+
 static inline void sched_ttwu_pending(void) { }
 
 #endif /* CONFIG_SMP */
@@ -1212,6 +1218,7 @@ struct sched_class {
 
        void (*set_cpus_allowed)(struct task_struct *p,
                                 const struct cpumask *newmask);
+       void (*exclude_higher_class)(struct rq *rq, struct cpumask *dstp);
 
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@ -1269,6 +1276,7 @@ extern void idle_exit_fair(struct rq *this_rq);
 
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
 
+extern int exclude_higher_class(struct cpumask *dstp, struct task_struct *p);
 #else
 
 static inline void idle_enter_fair(struct rq *rq) { }
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index cbc67da..e6694cb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -15,6 +15,26 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
        return task_cpu(p); /* stop tasks as never migrate */
 }
+
+static cpumask_t stop_runnable_mask;
+
+static void exclude_higher_class_stop(struct rq *rq, struct cpumask *dstp)
+{
+       cpumask_andnot(dstp, dstp, &stop_runnable_mask);
+}
+
+static inline void set_stop_runnable_mask(int cpu)
+{
+       cpumask_set_cpu(cpu, &stop_runnable_mask);
+}
+
+static inline void clear_stop_runnable_mask(int cpu)
+{
+       cpumask_clear_cpu(cpu, &stop_runnable_mask);
+}
+#else
+static inline void set_stop_runnable_mask(int cpu) { }
+static inline void clear_stop_runnable_mask(int cpu) { }
 #endif /* CONFIG_SMP */
 
 static void
@@ -33,6 +53,11 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 
        put_prev_task(rq, prev);
 
+       if (prev->sched_class == &dl_sched_class)
+               queue_push_dl_tasks(rq);
+       else if (prev->sched_class == &rt_sched_class)
+               queue_push_rt_tasks(rq);
+
        stop->se.exec_start = rq_clock_task(rq);
 
        return stop;
@@ -42,12 +67,14 @@ static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        add_nr_running(rq, 1);
+       set_stop_runnable_mask(cpu_of(rq));
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        sub_nr_running(rq, 1);
+       clear_stop_runnable_mask(cpu_of(rq));
 }
 
 static void yield_task_stop(struct rq *rq)
@@ -124,6 +151,7 @@ const struct sched_class stop_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_stop,
        .set_cpus_allowed       = set_cpus_allowed_common,
+       .exclude_higher_class   = exclude_higher_class_stop,
 #endif
 
        .set_curr_task          = set_curr_task_stop,
-- 
1.9.1
