On Sat, 2016-05-21 at 21:00 +0200, Mike Galbraith wrote:
> On Sat, 2016-05-21 at 16:04 +0200, Mike Galbraith wrote:
> 
> > Wakees that were not migrated/normalized eat an unwanted min_vruntime,
> > and likely take a size XXL latency hit.  Big box running master bled
> > profusely under heavy load until I turned TTWU_QUEUE off.
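(Recap for the archives of what "eats an unwanted min_vruntime" means: a wakee
whose vruntime is still absolute gets cfs_rq->min_vruntime added on top at
enqueue time as if it had been migrated, landing it far to the right of the
tree.  A toy standalone sketch of that arithmetic, not kernel code, numbers
invented:

/* toy_latency.c - illustration only, made-up values */
#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 1000000000ULL;	/* pretend cfs_rq->min_vruntime */
	unsigned long long wakee = min_vruntime + 3000000ULL;	/* absolute, near the left edge */

	/* wakee was never normalized, but gets renormalized anyway ... */
	wakee += min_vruntime;

	/* ... and now trails the leftmost task by roughly a full min_vruntime */
	printf("lag behind min_vruntime: %llu\n", wakee - min_vruntime);
	return 0;
}
)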
May as well make it official and against master.today.  Fly or die little
patchlet.

sched/fair: Move se->vruntime normalization state into struct sched_entity

b5179ac70de ceased globally normalizing wakee vruntime in ttwu(), leaving
sched_ttwu_pending() with the need to know whether each wakee on the wake_list
was migrated or not, to pass that on to fair class functions so they can DTRT
wrt vruntime normalization.

Store vruntime normalization state in struct sched_entity, so fair class
functions that need it always have it, and sched_ttwu_pending() again doesn't
need to care whether tasks on the wake_list have been migrated or not.  Since
there are now no consumers of ENQUEUE_MIGRATED, drop it as well.

master v4.6-8889-gf6c658df6385 virgin
 256     49096    71698.99 MB/sec  warmup   1 sec  latency 1136.488 ms
 256    155009    72862.08 MB/sec  execute  1 sec  latency 3136.900 ms
 256    207430    72628.04 MB/sec  execute  2 sec  latency 4137.001 ms
 256    259635    72442.97 MB/sec  execute  3 sec  latency 5137.105 ms
 256    311905    72371.84 MB/sec  execute  4 sec  latency 6137.214 ms
 256    364210    72564.99 MB/sec  execute  5 sec  latency 7137.323 ms
 256    416551    72598.74 MB/sec  execute  6 sec  latency 5816.895 ms
 256    468824    72601.54 MB/sec  execute  7 sec  latency 6815.386 ms
 256    520996    72621.87 MB/sec  execute  8 sec  latency 7815.499 ms
 256    573113    72608.75 MB/sec  execute  9 sec  latency 8815.609 ms
 256  cleanup  10 sec
   0  cleanup  10 sec

master v4.6-8889-gf6c658df6385 post
 256     51527    75357.55 MB/sec  warmup   1 sec  latency 21.591 ms
 256    157610    73188.06 MB/sec  execute  1 sec  latency 12.985 ms
 256    210089    72809.01 MB/sec  execute  2 sec  latency 11.543 ms
 256    262554    72681.86 MB/sec  execute  3 sec  latency 0.209 ms
 256    315432    72798.65 MB/sec  execute  4 sec  latency 0.206 ms
 256    368162    72963.33 MB/sec  execute  5 sec  latency 8.052 ms
 256    420854    72976.50 MB/sec  execute  6 sec  latency 0.221 ms
 256    473420    72953.76 MB/sec  execute  7 sec  latency 0.198 ms
 256    525859    73011.17 MB/sec  execute  8 sec  latency 2.810 ms
 256    578301    73052.84 MB/sec  execute  9 sec  latency 0.247 ms
 256  cleanup  10 sec
   0  cleanup  10 sec

Fixes: b5179ac70de ("sched/fair: Prepare to fix fairness problems on migration")
Signed-off-by: Mike Galbraith <umgwanakikb...@gmail.com>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |    6 +----
 kernel/sched/fair.c   |   60 ++++++++++++++++++++------------------------------
 3 files changed, 28 insertions(+), 39 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1319,6 +1319,7 @@ struct sched_entity {
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
+	bool			normalized;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1719,9 +1719,6 @@ ttwu_do_activate(struct rq *rq, struct t
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
-
-	if (wake_flags & WF_MIGRATED)
-		en_flags |= ENQUEUE_MIGRATED;
 #endif
 
 	ttwu_activate(rq, p, en_flags);
@@ -1774,7 +1771,7 @@ void sched_ttwu_pending(void)
 		 * See ttwu_queue(); we only call ttwu_queue_remote() when
 		 * its a x-cpu wakeup.
		 */
-		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+		ttwu_do_activate(rq, p, 0, cookie);
 	}
 
 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -2166,6 +2163,7 @@ static void __sched_fork(unsigned long c
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.normalized		= true;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3230,6 +3230,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 
 	/* ensure we never gain time by being placed backwards. */
 	se->vruntime = max_vruntime(se->vruntime, vruntime);
+	se->normalized = false;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3285,29 +3286,40 @@ static inline void check_schedstat_requi
  * CPU and an up-to-date min_vruntime on the destination CPU.
  */
 
+static void normalize_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	se->vruntime -= cfs_rq->min_vruntime;
+	se->normalized = true;
+}
+
+static void renormalize_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	se->vruntime += cfs_rq->min_vruntime;
+	se->normalized = false;
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
-	bool curr = cfs_rq->curr == se;
+	bool renorm = se->normalized, curr = cfs_rq->curr == se;
 
 	/*
-	 * If we're the current task, we must renormalise before calling
+	 * If we're the current task, we must renormalize before calling
 	 * update_curr().
 	 */
 	if (renorm && curr)
-		se->vruntime += cfs_rq->min_vruntime;
+		renormalize_entity(cfs_rq, se);
 
 	update_curr(cfs_rq);
 
 	/*
-	 * Otherwise, renormalise after, such that we're placed at the current
+	 * Otherwise, renormalize after, such that we're placed at the current
 	 * moment in time, instead of some random moment in the past. Being
 	 * placed in the past could significantly boost this task to the
 	 * fairness detriment of existing tasks.
 	 */
 	if (renorm && !curr)
-		se->vruntime += cfs_rq->min_vruntime;
+		renormalize_entity(cfs_rq, se);
 
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
@@ -3406,7 +3418,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	 * movement in our normalized position.
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
-		se->vruntime -= cfs_rq->min_vruntime;
+		normalize_entity(cfs_rq, se);
 
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
@@ -5408,7 +5420,7 @@ static void migrate_task_rq_fair(struct
 		min_vruntime = cfs_rq->min_vruntime;
 #endif
 
-	se->vruntime -= min_vruntime;
+	normalize_entity(cfs_rq, se);
 }
 
 /*
@@ -8319,7 +8331,7 @@ static void task_fork_fair(struct task_s
 		resched_curr(rq);
 	}
 
-	se->vruntime -= cfs_rq->min_vruntime;
+	normalize_entity(cfs_rq, se);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -8348,29 +8360,7 @@ prio_changed_fair(struct rq *rq, struct
 
 static inline bool vruntime_normalized(struct task_struct *p)
 {
-	struct sched_entity *se = &p->se;
-
-	/*
-	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
-	 * the dequeue_entity(.flags=0) will already have normalized the
-	 * vruntime.
-	 */
-	if (p->on_rq)
-		return true;
-
-	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
-	 * But there are some cases where it has already been normalized:
-	 *
-	 * - A forked child which is waiting for being woken up by
-	 *   wake_up_new_task().
-	 * - A task which has been woken up by try_to_wake_up() and
-	 *   waiting for actually being woken up by sched_ttwu_pending().
-	 */
-	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
-		return true;
-
-	return false;
+	return p->se.normalized;
 }
 
 static void detach_task_cfs_rq(struct task_struct *p)
@@ -8384,7 +8374,7 @@ static void detach_task_cfs_rq(struct ta
 		 * cause 'unlimited' sleep bonus.
 		 */
 		place_entity(cfs_rq, se, 0);
-		se->vruntime -= cfs_rq->min_vruntime;
+		normalize_entity(cfs_rq, se);
 	}
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
@@ -8407,8 +8397,8 @@ static void attach_task_cfs_rq(struct ta
 	/* Synchronize task with its cfs_rq */
 	attach_entity_load_avg(cfs_rq, se);
 
-	if (!vruntime_normalized(p))
-		se->vruntime += cfs_rq->min_vruntime;
+	if (vruntime_normalized(p))
+		renormalize_entity(cfs_rq, se);
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
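For anyone skimming rather than reading the diff: the shape of the bookkeeping
boils down to the entity itself remembering whether its vruntime is currently
queue-relative, so enqueue only re-adds min_vruntime when it was actually
subtracted.  A standalone toy sketch of that idea (again not kernel code, toy
names and invented queue values):

/* toy_normalize.c - illustration only */
#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq { unsigned long long min_vruntime; };
struct toy_se { unsigned long long vruntime; bool normalized; };

/* make vruntime relative to the queue we are leaving */
static void toy_normalize(struct toy_cfs_rq *q, struct toy_se *se)
{
	se->vruntime -= q->min_vruntime;
	se->normalized = true;
}

/* make vruntime absolute again on the queue we are joining */
static void toy_renormalize(struct toy_cfs_rq *q, struct toy_se *se)
{
	se->vruntime += q->min_vruntime;
	se->normalized = false;
}

/* enqueue consults the entity's own state, not caller-supplied flags */
static void toy_enqueue(struct toy_cfs_rq *q, struct toy_se *se)
{
	if (se->normalized)
		toy_renormalize(q, se);
}

int main(void)
{
	struct toy_cfs_rq src = { .min_vruntime = 1000 };
	struct toy_cfs_rq dst = { .min_vruntime = 5000 };
	struct toy_se migrated = { .vruntime = 1200, .normalized = false };
	struct toy_se local    = { .vruntime = 1300, .normalized = false };

	toy_normalize(&src, &migrated);	/* leaving src: keep only the +200 lag */
	toy_enqueue(&dst, &migrated);	/* joining dst: 5000 + 200 = 5200 */
	toy_enqueue(&src, &local);	/* never left src: vruntime untouched */

	printf("migrated: %llu, local: %llu\n",
	       migrated.vruntime, local.vruntime);	/* 5200, 1300 */
	return 0;
}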