Le lundi 21 juin 2021 à 14:42:23 (+0200), Odin Ugedal a écrit : > Hi, > > Did some more research, and it looks like this is what happens: > > $ tree /sys/fs/cgroup/ltp/ -d --charset=ascii > /sys/fs/cgroup/ltp/ > |-- drain > `-- test-6851 > `-- level2 > |-- level3a > | |-- worker1 > | `-- worker2 > `-- level3b > `-- worker3 > > Timeline (ish): > - worker3 gets throttled > - level3b is decayed, since it has no more load > - level2 get throttled > - worker3 get unthrottled > - level2 get unthrottled > - worker3 is added to list > - level3b is not added to list, since nr_running==0 and is decayed > > > The attached diff (based on > https://lore.kernel.org/lkml/20210518125202.78658-3-o...@uged.al/) > fixes the issue for me. Not the most elegant solution, but the > simplest one as of now, and to show what is wrong. > > Any thoughts Vincent?
I would prefer that we use the reason of adding the cfs in the list instead. Something like the below should also fix the problem. It is based on a proposal I made to Rik some time ago when he tried to flatten the rq: https://lore.kernel.org/lkml/20190906191237.27006-6-r...@surriel.com/ This will ensure that a cfs is added in the list whenever one of its children is still in the list. --- kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea7de54cb022..e751061a9449 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3272,6 +3272,31 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). 
+ */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + struct rq *rq = rq_of(cfs_rq); + + prev = rq->tmp_alone_branch; + } + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { @@ -3287,6 +3312,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.runnable_sum) return false; + if (child_cfs_rq_on_list(cfs_rq)) + return false; + return true; } -- 2.17.1 > > Thanks > Odin > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index bfaa6e1f6067..aa32e9c29efd 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -376,7 +376,8 @@ static inline bool list_add_leaf_cfs_rq(struct > cfs_rq *cfs_rq) > return false; > } > > -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) > +/* Returns 1 if cfs_rq was present in the list and removed */ > +static inline bool list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) > { > if (cfs_rq->on_list) { > struct rq *rq = rq_of(cfs_rq); > @@ -393,7 +394,9 @@ static inline void list_del_leaf_cfs_rq(struct > cfs_rq *cfs_rq) > > list_del_rcu(&cfs_rq->leaf_cfs_rq_list); > cfs_rq->on_list = 0; > + return 1; > } > + return 0; > } > > static inline void assert_list_leaf_cfs_rq(struct rq *rq) > @@ -3298,24 +3301,6 @@ static inline void cfs_rq_util_change(struct > cfs_rq *cfs_rq, int flags) > > #ifdef CONFIG_SMP > #ifdef CONFIG_FAIR_GROUP_SCHED > - > -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) > -{ > - if (cfs_rq->load.weight) > - return false; > - > - if (cfs_rq->avg.load_sum) > - return false; > - > - if (cfs_rq->avg.util_sum) > - return false; > - > - if (cfs_rq->avg.runnable_sum) > - return false; > - > - return true; > -} > - > /** > * update_tg_load_avg - update the tg's load avg > * @cfs_rq: 
the cfs_rq whose avg changed > @@ -4109,11 +4094,6 @@ static inline void update_misfit_status(struct > task_struct *p, struct rq *rq) > > #else /* CONFIG_SMP */ > > -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) > -{ > - return true; > -} > - > #define UPDATE_TG 0x0 > #define SKIP_AGE_LOAD 0x0 > #define DO_ATTACH 0x0 > @@ -4771,10 +4751,11 @@ static int tg_unthrottle_up(struct task_group > *tg, void *data) > if (!cfs_rq->throttle_count) { > cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - > cfs_rq->throttled_clock_task; > - > - /* Add cfs_rq with load or one or more already running > entities to the list */ > - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) > + if (cfs_rq->insert_on_unthrottle) { > list_add_leaf_cfs_rq(cfs_rq); > + if (tg->parent) > + > tg->parent->cfs_rq[cpu_of(rq)]->insert_on_unthrottle = true; > + } > } > > return 0; > @@ -4788,7 +4769,7 @@ static int tg_throttle_down(struct task_group > *tg, void *data) > /* group is entering throttled state, stop time */ > if (!cfs_rq->throttle_count) { > cfs_rq->throttled_clock_task = rq_clock_task(rq); > - list_del_leaf_cfs_rq(cfs_rq); > + cfs_rq->insert_on_unthrottle = list_del_leaf_cfs_rq(cfs_rq); > } > cfs_rq->throttle_count++; > > @@ -8019,6 +8000,23 @@ static bool __update_blocked_others(struct rq > *rq, bool *done) > > #ifdef CONFIG_FAIR_GROUP_SCHED > > +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) > +{ > + if (cfs_rq->load.weight) > + return false; > + > + if (cfs_rq->avg.load_sum) > + return false; > + > + if (cfs_rq->avg.util_sum) > + return false; > + > + if (cfs_rq->avg.runnable_sum) > + return false; > + > + return true; > +} > + > static bool __update_blocked_fair(struct rq *rq, bool *done) > { > struct cfs_rq *cfs_rq, *pos; > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index a189bec13729..12a707d99ee6 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -602,6 +602,7 @@ struct cfs_rq { > u64 throttled_clock_task_time; > 
int throttled; > int throttle_count; > + int insert_on_unthrottle; > struct list_head throttled_list; > #endif /* CONFIG_CFS_BANDWIDTH */ > #endif /* CONFIG_FAIR_GROUP_SCHED */