During force-idle, we end up doing a cross-CPU comparison of vruntimes in pick_next_task(). If we simply compare (vruntime - min_vruntime) across CPUs, and each CPU has only one task, we always end up comparing 0 with 0 and picking the same task every time, which starves the task that was not picked. To fix this, take a snapshot of the min_vruntime when entering force idle and use that snapshot for the comparison. The snapshot is used only for cross-CPU vruntime comparison, and nothing else.
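To make the failure mode concrete, here is a stand-alone toy sketch (illustrative only, not part of this patch; struct toy_rq, delta_live() and delta_snapshot() are made-up stand-ins for the cfs_rq fields and the cfs_prio_less() normalization, not kernel code):

/*
 * Why comparing (vruntime - min_vruntime) degenerates when each sibling
 * has a single task, and how a snapshot taken at force-idle entry keeps
 * the accumulated lag visible.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_rq {
	uint64_t min_vruntime;		/* tracks the running task's vruntime */
	uint64_t min_vruntime_fi;	/* snapshot taken when force idle began */
	uint64_t task_vruntime;		/* vruntime of the lone queued task */
};

/* Old scheme: normalize a's task against b's using the live min_vruntime. */
static int64_t delta_live(struct toy_rq *a, struct toy_rq *b)
{
	return (int64_t)(a->task_vruntime - b->task_vruntime) +
	       (int64_t)(b->min_vruntime - a->min_vruntime);
}

/* Patched scheme: normalize against the force-idle snapshot instead. */
static int64_t delta_snapshot(struct toy_rq *a, struct toy_rq *b)
{
	return (int64_t)(a->task_vruntime - b->task_vruntime) +
	       (int64_t)(b->min_vruntime_fi - a->min_vruntime_fi);
}

int main(void)
{
	/* Force idle begins: both siblings snapshot their min_vruntime. */
	struct toy_rq a = { 1000, 1000, 1000 };	/* task keeps running */
	struct toy_rq b = { 2000, 2000, 2000 };	/* task is force-idled */

	/*
	 * a's lone task runs on; its min_vruntime advances with it, so the
	 * live delta cancels to 0 and the idled task on b is never preferred.
	 */
	a.task_vruntime += 500;
	a.min_vruntime += 500;

	printf("live delta:     %lld\n", (long long)delta_live(&a, &b));	/* 0 */
	printf("snapshot delta: %lld\n", (long long)delta_snapshot(&a, &b));	/* 500: b wins */
	return 0;
}

With the live min_vruntime the two terms cancel and the same sibling is picked forever; with the snapshot, the 500 units of vruntime consumed during force idle stay visible to the comparison, so the starved task on the other sibling wins the next pick.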
This resolves several performance issues seen in the ChromeOS audio use case.

Tested-by: Julien Desfossez <jdesfos...@digitalocean.com>
Signed-off-by: Joel Fernandes (Google) <j...@joelfernandes.org>
---
 kernel/sched/core.c  | 33 ++++++++++++++++++++-------------
 kernel/sched/fair.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  5 +++++
 3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 469428979182..a5404ec9e89a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
 	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 		return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-	if (pa == MAX_RT_PRIO + MAX_NICE) {	/* fair */
-		u64 vruntime = b->se.vruntime;
-
-		/*
-		 * Normalize the vruntime if tasks are in different cpus.
-		 */
-		if (task_cpu(a) != task_cpu(b)) {
-			vruntime -= task_cfs_rq(b)->min_vruntime;
-			vruntime += task_cfs_rq(a)->min_vruntime;
-		}
-
-		return !((s64)(a->se.vruntime - vruntime) <= 0);
-	}
+	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
+		return cfs_prio_less(a, b);
 
 	return false;
 }
@@ -4648,6 +4637,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	struct task_struct *next, *max = NULL;
 	const struct sched_class *class;
 	const struct cpumask *smt_mask;
+	bool fi_before = false;
 	bool need_sync;
 	int i, j, cpu;
 
@@ -4712,6 +4702,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	rq->core->core_cookie = 0UL;
 	if (rq->core->core_forceidle) {
 		need_sync = true;
+		fi_before = true;
 		rq->core->core_forceidle = false;
 	}
 	for_each_cpu(i, smt_mask) {
@@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			update_rq_clock(rq_i);
 	}
 
+	/* Reset the snapshot if core is no longer in force-idle. */
+	if (!fi_before) {
+		for_each_cpu(i, smt_mask) {
+			struct rq *rq_i = cpu_rq(i);
+			rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+		}
+	}
+
 	/*
 	 * Try and select tasks for each sibling in decending sched_class
 	 * order.
@@ -4859,6 +4858,14 @@ next_class:;
 			resched_curr(rq_i);
 	}
 
+	/* Snapshot if core is in force-idle. */
+	if (!fi_before && rq->core->core_forceidle) {
+		for_each_cpu(i, smt_mask) {
+			struct rq *rq_i = cpu_rq(i);
+			rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+		}
+	}
+
 done:
 	set_next_task(rq, next);
 	return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56bea0decda1..9cae08c3fca1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
 	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
 		resched_curr(rq);
 }
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+	bool samecpu = task_cpu(a) == task_cpu(b);
+	struct sched_entity *sea = &a->se;
+	struct sched_entity *seb = &b->se;
+	struct cfs_rq *cfs_rqa;
+	struct cfs_rq *cfs_rqb;
+	s64 delta;
+
+	if (samecpu) {
+		/* vruntime is per cfs_rq */
+		while (!is_same_group(sea, seb)) {
+			int sea_depth = sea->depth;
+			int seb_depth = seb->depth;
+			if (sea_depth >= seb_depth)
+				sea = parent_entity(sea);
+			if (sea_depth <= seb_depth)
+				seb = parent_entity(seb);
+		}
+
+		delta = (s64)(sea->vruntime - seb->vruntime);
+		goto out;
+	}
+
+	/* crosscpu: compare root level se's vruntime to decide priority */
+	while (sea->parent)
+		sea = sea->parent;
+	while (seb->parent)
+		seb = seb->parent;
+
+	cfs_rqa = sea->cfs_rq;
+	cfs_rqb = seb->cfs_rq;
+
+	/* normalize vruntime WRT their rq's base */
+	delta = (s64)(sea->vruntime - seb->vruntime) +
+		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+out:
+	return delta > 0;
+}
 #else
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 884d23d5e55d..dfdb0ebb07a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -524,6 +524,9 @@ struct cfs_rq {
 
 	u64			exec_clock;
 	u64			min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+	u64			min_vruntime_fi;
+#endif
 #ifndef CONFIG_64BIT
 	u64			min_vruntime_copy;
 #endif
@@ -1106,6 +1109,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 	return &rq->__lock;
 }
 
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
-- 
2.29.0.rc1.297.gfa9743e501-goog