Hi Rik, On Thu, 22 Aug 2019 at 04:18, Rik van Riel <r...@surriel.com> wrote: > > The runnable_load magic is used to quickly propagate information about > runnable tasks up the hierarchy of runqueues. The runnable_load_avg is > mostly used for the load balancing code, which only examines the value at > the root cfs_rq. > > Redefine the root cfs_rq runnable_load_avg to be the sum of task_h_loads > of the runnable tasks. This works because the hierarchical runnable_load of > a task is already equal to the task_se_h_load today. This provides enough > information to the load balancer. > > The runnable_load_avg of the cgroup cfs_rqs does not appear to be > used for anything, so don't bother calculating those. > > This removes one of the things that the code currently traverses the > cgroup hierarchy for, and getting rid of it brings us one step closer > to a flat runqueue for the CPU controller.
I like your proposal, but I just wanted to clarify one thing about this patch. Although you removed the computation of runnable_load_avg for the cgroup cfs_rqs, we still traverse the hierarchy when updating the root cfs_rq runnable_load_avg, because computing the task_h_loads traverses the hierarchy anyway. That being said, if we manage to remove the need for using runnable_load_avg, we will completely skip this traversal. I have a proposal for removing it from the load balance and wakeup paths, but I haven't looked at the NUMA stats, which also use it. (A toy sketch of the task_h_load traversal I mean is appended below the quoted patch.)

> > Signed-off-by: Rik van Riel <r...@surriel.com> > --- > include/linux/sched.h | 3 +- > kernel/sched/core.c | 2 - > kernel/sched/debug.c | 1 + > kernel/sched/fair.c | 125 +++++++++++++----------------------------- > kernel/sched/pelt.c | 64 ++++++--------------- > kernel/sched/sched.h | 6 -- > 6 files changed, 56 insertions(+), 145 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 11837410690f..f5bb6948e40c 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -391,7 +391,6 @@ struct util_est { > struct sched_avg { > u64 last_update_time; > u64 load_sum; > - u64 runnable_load_sum; > u32 util_sum; > u32 period_contrib; > unsigned long load_avg; > @@ -439,7 +438,6 @@ struct sched_statistics { > struct sched_entity { > /* For load-balancing: */ > struct load_weight load; > - unsigned long runnable_weight; > struct rb_node run_node; > struct list_head group_node; > unsigned int on_rq; > @@ -455,6 +453,7 @@ struct sched_entity { > > #ifdef CONFIG_FAIR_GROUP_SCHED > int depth; > + unsigned long enqueued_h_load; > struct sched_entity *parent; > /* rq on which this entity is (to be) queued: */ > struct cfs_rq *cfs_rq; > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 874c427742a9..fbd96900f715 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -744,7 +744,6 @@ static void set_load_weight(struct task_struct *p, bool > update_load) > if (task_has_idle_policy(p)) { > load->weight = scale_load(WEIGHT_IDLEPRIO); > load->inv_weight = WMULT_IDLEPRIO; > - p->se.runnable_weight = load->weight; > return; > } > > @@ -757,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool > update_load) > } else { > load->weight = scale_load(sched_prio_to_weight[prio]); > load->inv_weight = sched_prio_to_wmult[prio]; > - p->se.runnable_weight = load->weight; > } > } > > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c > index f6beaca97a09..cefc1b171c0b 100644 > --- a/kernel/sched/debug.c > +++ b/kernel/sched/debug.c > @@ -962,6 +962,7 @@ void proc_sched_show_task(struct task_struct *p, struct > pid_namespace *ns, > P(se.avg.load_avg); > P(se.avg.runnable_load_avg); > P(se.avg.util_avg); > + P(se.enqueued_h_load); > P(se.avg.last_update_time); > P(se.avg.util_est.ewma); > P(se.avg.util_est.enqueued); > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index eadf9a96b3e1..30afeda1620f 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -723,9 +723,7 @@ void init_entity_runnable_average(struct sched_entity *se) > * nothing has been attached to the task group yet.
> */ > if (entity_is_task(se)) > - sa->runnable_load_avg = sa->load_avg = > scale_load_down(se->load.weight); > - > - se->runnable_weight = se->load.weight; > + sa->load_avg = scale_load_down(se->load.weight); > > /* when this task enqueue'ed, it will contribute to its cfs_rq's > load_avg */ > } > @@ -2766,20 +2764,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct > sched_entity *se) > static inline void > enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) > { > - cfs_rq->runnable_weight += se->runnable_weight; > + if (entity_is_task(se)) { > + struct cfs_rq *root_cfs_rq = &cfs_rq->rq->cfs; > + se->enqueued_h_load = task_se_h_load(se); > > - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; > - cfs_rq->avg.runnable_load_sum += se_runnable(se) * > se->avg.runnable_load_sum; > + root_cfs_rq->avg.runnable_load_avg += se->enqueued_h_load; > + } > } > > static inline void > dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) > { > - cfs_rq->runnable_weight -= se->runnable_weight; > + if (entity_is_task(se)) { > + struct cfs_rq *root_cfs_rq = &cfs_rq->rq->cfs; > + sub_positive(&root_cfs_rq->avg.runnable_load_avg, > + se->enqueued_h_load); > + } > +} > + > +static inline void > +update_runnable_load_avg(struct sched_entity *se) > +{ > + struct cfs_rq *root_cfs_rq = &cfs_rq_of(se)->rq->cfs; > + long new_h_load, delta; > > - sub_positive(&cfs_rq->avg.runnable_load_avg, > se->avg.runnable_load_avg); > - sub_positive(&cfs_rq->avg.runnable_load_sum, > - se_runnable(se) * se->avg.runnable_load_sum); > + SCHED_WARN_ON(!entity_is_task(se)); > + > + if (!se->on_rq) > + return; > + > + new_h_load = task_se_h_load(se); > + delta = new_h_load - se->enqueued_h_load; > + root_cfs_rq->avg.runnable_load_avg += delta; > + se->enqueued_h_load = new_h_load; > } > > static inline void > @@ -2807,7 +2824,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct > sched_entity *se) { } > #endif > > static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, > - unsigned long weight, unsigned long runnable) > + unsigned long weight) > { > if (se->on_rq) { > /* commit outstanding execution time */ > @@ -2818,7 +2835,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, > struct sched_entity *se, > } > dequeue_load_avg(cfs_rq, se); > > - se->runnable_weight = runnable; > update_load_set(&se->load, weight); > > #ifdef CONFIG_SMP > @@ -2826,8 +2842,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, > struct sched_entity *se, > u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; > > se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, > divider); > - se->avg.runnable_load_avg = > - div_u64(se_runnable(se) * se->avg.runnable_load_sum, > divider); > } while (0); > #endif > > @@ -2845,7 +2859,7 @@ void reweight_task(struct task_struct *p, int prio) > struct load_weight *load = &se->load; > unsigned long weight = scale_load(sched_prio_to_weight[prio]); > > - reweight_entity(cfs_rq, se, weight, weight); > + reweight_entity(cfs_rq, se, weight); > load->inv_weight = sched_prio_to_wmult[prio]; > } > > @@ -2958,49 +2972,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) > return clamp_t(long, shares, MIN_SHARES, tg_shares); > } > > -/* > - * This calculates the effective runnable weight for a group entity based on > - * the group entity weight calculated above. > - * > - * Because of the above approximation (2), our group entity weight is > - * an load_avg based ratio (3). 
This means that it includes blocked load and > - * does not represent the runnable weight. > - * > - * Approximate the group entity's runnable weight per ratio from the group > - * runqueue: > - * > - * grq->avg.runnable_load_avg > - * ge->runnable_weight = ge->load.weight * -------------------------- (7) > - * grq->avg.load_avg > - * > - * However, analogous to above, since the avg numbers are slow, this leads to > - * transients in the from-idle case. Instead we use: > - * > - * ge->runnable_weight = ge->load.weight * > - * > - * max(grq->avg.runnable_load_avg, grq->runnable_weight) > - * ----------------------------------------------------- (8) > - * max(grq->avg.load_avg, grq->load.weight) > - * > - * Where these max() serve both to use the 'instant' values to fix the slow > - * from-idle and avoid the /0 on to-idle, similar to (6). > - */ > -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) > -{ > - long runnable, load_avg; > - > - load_avg = max(cfs_rq->avg.load_avg, > - scale_load_down(cfs_rq->load.weight)); > - > - runnable = max(cfs_rq->avg.runnable_load_avg, > - scale_load_down(cfs_rq->runnable_weight)); > - > - runnable *= shares; > - if (load_avg) > - runnable /= load_avg; > - > - return clamp_t(long, runnable, MIN_SHARES, shares); > -} > #endif /* CONFIG_SMP */ > > static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); > @@ -3012,25 +2983,26 @@ static inline int throttled_hierarchy(struct cfs_rq > *cfs_rq); > static void update_cfs_group(struct sched_entity *se) > { > struct cfs_rq *gcfs_rq = group_cfs_rq(se); > - long shares, runnable; > + long shares; > > - if (!gcfs_rq) > + if (!gcfs_rq) { > + update_runnable_load_avg(se); > return; > + } > > if (throttled_hierarchy(gcfs_rq)) > return; > > #ifndef CONFIG_SMP > - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); > + shares = READ_ONCE(gcfs_rq->tg->shares); > > if (likely(se->load.weight == shares)) > return; > #else > shares = calc_group_shares(gcfs_rq); > - runnable = calc_group_runnable(gcfs_rq, shares); > #endif > > - reweight_entity(cfs_rq_of(se), se, shares, runnable); > + reweight_entity(cfs_rq_of(se), se, shares); > } > > #else /* CONFIG_FAIR_GROUP_SCHED */ > @@ -3243,8 +3215,8 @@ static inline void > update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, > struct cfs_rq *gcfs_rq) > { > long delta_avg, running_sum, runnable_sum = > gcfs_rq->prop_runnable_sum; > - unsigned long runnable_load_avg, load_avg; > - u64 runnable_load_sum, load_sum = 0; > + unsigned long load_avg; > + u64 load_sum = 0; > s64 delta_sum; > > if (!runnable_sum) > @@ -3292,19 +3264,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct > sched_entity *se, struct cf > se->avg.load_avg = load_avg; > add_positive(&cfs_rq->avg.load_avg, delta_avg); > add_positive(&cfs_rq->avg.load_sum, delta_sum); > - > - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; > - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); > - delta_sum = runnable_load_sum - se_weight(se) * > se->avg.runnable_load_sum; > - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; > - > - se->avg.runnable_load_sum = runnable_sum; > - se->avg.runnable_load_avg = runnable_load_avg; > - > - if (se->on_rq) { > - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); > - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); > - } > } > > static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long > runnable_sum) > @@ -3399,7 +3358,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq > *cfs_rq, long 
runnable_sum > static inline int > update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > { > - unsigned long removed_load = 0, removed_util = 0, > removed_runnable_sum = 0; > + unsigned long removed_load = 0, removed_util = 0; > struct sched_avg *sa = &cfs_rq->avg; > int decayed = 0; > > @@ -3410,7 +3369,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > raw_spin_lock(&cfs_rq->removed.lock); > swap(cfs_rq->removed.util_avg, removed_util); > swap(cfs_rq->removed.load_avg, removed_load); > - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); > cfs_rq->removed.nr = 0; > raw_spin_unlock(&cfs_rq->removed.lock); > > @@ -3422,8 +3380,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > sub_positive(&sa->util_avg, r); > sub_positive(&sa->util_sum, r * divider); > > - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); > - > decayed = 1; > } > > @@ -3477,8 +3433,6 @@ static void attach_entity_load_avg(struct cfs_rq > *cfs_rq, struct sched_entity *s > div_u64(se->avg.load_avg * se->avg.load_sum, > se_weight(se)); > } > > - se->avg.runnable_load_sum = se->avg.load_sum; > - > enqueue_load_avg(cfs_rq, se); > cfs_rq->avg.util_avg += se->avg.util_avg; > cfs_rq->avg.util_sum += se->avg.util_sum; > @@ -7735,9 +7689,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq > *cfs_rq) > if (cfs_rq->avg.util_sum) > return false; > > - if (cfs_rq->avg.runnable_load_sum) > - return false; > - > return true; > } > > diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c > index befce29bd882..b3f3e8b29394 100644 > --- a/kernel/sched/pelt.c > +++ b/kernel/sched/pelt.c > @@ -106,7 +106,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 > d1, u32 d3) > */ > static __always_inline u32 > accumulate_sum(u64 delta, struct sched_avg *sa, > - unsigned long load, unsigned long runnable, int running) > + unsigned long load, int running) > { > u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ > u64 periods; > @@ -119,8 +119,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa, > */ > if (periods) { > sa->load_sum = decay_load(sa->load_sum, periods); > - sa->runnable_load_sum = > - decay_load(sa->runnable_load_sum, periods); > sa->util_sum = decay_load((u64)(sa->util_sum), periods); > > /* > @@ -134,8 +132,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa, > > if (load) > sa->load_sum += load * contrib; > - if (runnable) > - sa->runnable_load_sum += runnable * contrib; > if (running) > sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; > > @@ -172,7 +168,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, > */ > static __always_inline int > ___update_load_sum(u64 now, struct sched_avg *sa, > - unsigned long load, unsigned long runnable, int running) > + unsigned long load, int running) > { > u64 delta; > > @@ -206,7 +202,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, > * update_blocked_averages() > */ > if (!load) > - runnable = running = 0; > + running = 0; > > /* > * Now we know we crossed measurement unit boundaries. The *_avg > @@ -215,14 +211,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa, > * Step 1: accumulate *_sum since last_update_time. If we haven't > * crossed period boundaries, finish. 
> */ > - if (!accumulate_sum(delta, sa, load, runnable, running)) > + if (!accumulate_sum(delta, sa, load, running)) > return 0; > > return 1; > } > > static __always_inline void > -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long > runnable) > +___update_load_avg(struct sched_avg *sa, unsigned long load) > { > u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; > > @@ -230,41 +226,25 @@ ___update_load_avg(struct sched_avg *sa, unsigned long > load, unsigned long runna > * Step 2: update *_avg. > */ > sa->load_avg = div_u64(load * sa->load_sum, divider); > - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, > divider); > WRITE_ONCE(sa->util_avg, sa->util_sum / divider); > } > > /* > * sched_entity: > * > - * task: > - * se_runnable() == se_weight() > - * > - * group: [ see update_cfs_group() ] > - * se_weight() = tg->weight * grq->load_avg / tg->load_avg > - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg > - * > * load_sum := runnable_sum > * load_avg = se_weight(se) * runnable_avg > * > - * runnable_load_sum := runnable_sum > - * runnable_load_avg = se_runnable(se) * runnable_avg > - * > - * XXX collapse load_sum and runnable_load_sum > - * > * cfq_rq: > * > * load_sum = \Sum se_weight(se) * se->avg.load_sum > * load_avg = \Sum se->avg.load_avg > - * > - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum > - * runnable_load_avg = \Sum se->avg.runable_load_avg > */ > > int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) > { > - if (___update_load_sum(now, &se->avg, 0, 0, 0)) { > - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); > + if (___update_load_sum(now, &se->avg, 0, 0)) { > + ___update_load_avg(&se->avg, se_weight(se)); > return 1; > } > > @@ -273,10 +253,10 @@ int __update_load_avg_blocked_se(u64 now, struct > sched_entity *se) > > int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity > *se) > { > - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, > + if (___update_load_sum(now, &se->avg, !!se->on_rq, > cfs_rq->curr == se)) { > > - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); > + ___update_load_avg(&se->avg, se_weight(se)); > cfs_se_util_change(&se->avg); > return 1; > } > @@ -288,10 +268,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq > *cfs_rq) > { > if (___update_load_sum(now, &cfs_rq->avg, > scale_load_down(cfs_rq->load.weight), > - scale_load_down(cfs_rq->runnable_weight), > cfs_rq->curr != NULL)) { > > - ___update_load_avg(&cfs_rq->avg, 1, 1); > + ___update_load_avg(&cfs_rq->avg, 1); > return 1; > } > > @@ -303,20 +282,18 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq > *cfs_rq) > * > * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked > * util_sum = cpu_scale * load_sum > - * runnable_load_sum = load_sum > * > - * load_avg and runnable_load_avg are not supported and meaningless. > + * load_avg is not supported and meaningless. 
> * > */ > > int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) > { > if (___update_load_sum(now, &rq->avg_rt, > - running, > running, > running)) { > > - ___update_load_avg(&rq->avg_rt, 1, 1); > + ___update_load_avg(&rq->avg_rt, 1); > return 1; > } > > @@ -328,18 +305,16 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int > running) > * > * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked > * util_sum = cpu_scale * load_sum > - * runnable_load_sum = load_sum > * > */ > > int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) > { > if (___update_load_sum(now, &rq->avg_dl, > - running, > running, > running)) { > > - ___update_load_avg(&rq->avg_dl, 1, 1); > + ___update_load_avg(&rq->avg_dl, 1); > return 1; > } > > @@ -352,7 +327,6 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int > running) > * > * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked > * util_sum = cpu_scale * load_sum > - * runnable_load_sum = load_sum > * > */ > > @@ -379,17 +353,11 @@ int update_irq_load_avg(struct rq *rq, u64 running) > * We can safely remove running from rq->clock because > * rq->clock += delta with delta >= running > */ > - ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, > - 0, > - 0, > - 0); > - ret += ___update_load_sum(rq->clock, &rq->avg_irq, > - 1, > - 1, > - 1); > + ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0, 0); > + ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1, 1); > > if (ret) > - ___update_load_avg(&rq->avg_irq, 1, 1); > + ___update_load_avg(&rq->avg_irq, 1); > > return ret; > } > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index b52ed1ada0be..5be14cee61f9 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -487,7 +487,6 @@ struct cfs_bandwidth { }; > /* CFS-related fields in a runqueue */ > struct cfs_rq { > struct load_weight load; > - unsigned long runnable_weight; > unsigned int nr_running; > unsigned int h_nr_running; > > @@ -700,11 +699,6 @@ static inline long se_weight(struct sched_entity *se) > return scale_load_down(se->load.weight); > } > > -static inline long se_runnable(struct sched_entity *se) > -{ > - return scale_load_down(se->runnable_weight); > -} > - > static inline bool sched_asym_prefer(int a, int b) > { > return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); > -- > 2.20.1 >
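As an aside, here is a toy userspace model of the traversal I am talking about. This is not the kernel code: struct toy_cfs_rq, toy_task_h_load() and the numbers are made up for illustration, and the weight/load ratio is only a rough stand-in for the real h_load math. It just shows that a task's hierarchical load is its own load_avg scaled by the share of every ancestor group, so keeping the root runnable_load_avg as a sum of task_h_loads still costs an O(depth) walk per enqueue/dequeue/update:

/*
 * Toy model: each level scales the task's load by that group's share
 * of its parent runqueue, so one h_load computation visits every
 * ancestor between the task and the root.
 */
#include <stdio.h>

struct toy_cfs_rq {
	unsigned long load_avg;		/* load of this runqueue */
	unsigned long group_weight;	/* weight of its group entity in the parent */
	struct toy_cfs_rq *parent;	/* NULL at the root */
};

static unsigned long toy_task_h_load(unsigned long task_load_avg,
				     struct toy_cfs_rq *cfs_rq)
{
	unsigned long h_load = task_load_avg;

	/* This loop is the hierarchy traversal in question. */
	while (cfs_rq->parent) {
		h_load = h_load * cfs_rq->group_weight / (cfs_rq->load_avg + 1);
		cfs_rq = cfs_rq->parent;
	}
	return h_load;
}

int main(void)
{
	struct toy_cfs_rq root  = { .load_avg = 2048, .group_weight = 0,   .parent = NULL  };
	struct toy_cfs_rq child = { .load_avg = 1024, .group_weight = 512, .parent = &root };

	/* A task carrying load_avg 1024 in 'child' contributes ~511 at the root. */
	printf("task_h_load ~= %lu\n", toy_task_h_load(1024, &child));
	return 0;
}

With a deeper cgroup tree the loop just gets longer, which is why removing the remaining users of runnable_load_avg (load balance, wakeup, and the NUMA stats I mentioned) is what actually gets rid of the traversal.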