Hi Sacha,

On Mon, 21 Jun 2021 at 18:22, Vincent Guittot <vincent.guit...@linaro.org> wrote:
>
> On Monday 21 June 2021 at 14:42:23 (+0200), Odin Ugedal wrote:
> > Hi,
> >
> > Did some more research, and it looks like this is what happens:
> >
> > $ tree /sys/fs/cgroup/ltp/ -d --charset=ascii
> > /sys/fs/cgroup/ltp/
> > |-- drain
> > `-- test-6851
> >     `-- level2
> >         |-- level3a
> >         |   |-- worker1
> >         |   `-- worker2
> >         `-- level3b
> >             `-- worker3
> >
> > Timeline (ish):
> > - worker3 gets throttled
> > - level3b is decayed, since it has no more load
> > - level2 gets throttled
> > - worker3 gets unthrottled
> > - level2 gets unthrottled
> > - worker3 is added to the list
> > - level3b is not added to the list, since nr_running==0 and it is decayed
> >
> > The attached diff (based on
> > https://lore.kernel.org/lkml/20210518125202.78658-3-o...@uged.al/)
> > fixes the issue for me. It is not the most elegant solution, but it is
> > the simplest one as of now, and it shows what is wrong.
> >
> > Any thoughts Vincent?
>
> I would prefer that we use the reason for adding the cfs_rq to the list
> instead.
>
> Something like the below should also fix the problem. It is based on a
> proposal I made to Rik some time ago when he tried to flatten the rq:
> https://lore.kernel.org/lkml/20190906191237.27006-6-r...@surriel.com/
>
> This will ensure that a cfs_rq is added to the list whenever one of its
> children is still on the list.
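As an aside for anyone following along: the ordering invariant this relies
on is easy to model in userspace. The sketch below is not kernel code; the
struct, the direct parent pointer, and all names are made up, and only the
on_list case is modelled. It just demonstrates the property: a child is
always inserted immediately in front of its parent, so "is the entry before
me one of my children?" tells a group whether a branch below it is still
queued.

/* Toy model of the leaf_cfs_rq list ordering; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq {
        const char *name;
        struct toy_cfs_rq *parent;      /* task-group parent */
        struct toy_cfs_rq *prev, *next; /* leaf list linkage */
        bool on_list;
};

static struct toy_cfs_rq head = { "head", NULL, &head, &head, true };

/* Insert child immediately before pos, the way list_add_leaf_cfs_rq()
 * places a child in front of its parent. */
static void add_before(struct toy_cfs_rq *child, struct toy_cfs_rq *pos)
{
        child->prev = pos->prev;
        child->next = pos;
        pos->prev->next = child;
        pos->prev = child;
        child->on_list = true;
}

/* The prev-entry test from child_cfs_rq_on_list(), simplified. */
static bool child_on_list(struct toy_cfs_rq *cfs_rq)
{
        return cfs_rq->on_list && cfs_rq->prev->parent == cfs_rq;
}

int main(void)
{
        struct toy_cfs_rq level2  = { .name = "level2" };
        struct toy_cfs_rq level3b = { .name = "level3b", .parent = &level2 };
        struct toy_cfs_rq worker3 = { .name = "worker3", .parent = &level3b };

        add_before(&level2, &head);     /* parent enters first...           */
        add_before(&level3b, &level2);  /* ...children land right before it */
        add_before(&worker3, &level3b);

        printf("level3b: %d\n", child_on_list(&level3b)); /* 1: worker3 queued */
        printf("worker3: %d\n", child_on_list(&worker3)); /* 0: leaf group     */
        return 0;
}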
>
> Could you confirm that this patch fixes the problem for you too?
>
> ---
>  kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++
>  1 file changed, 28 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ea7de54cb022..e751061a9449 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3272,6 +3272,31 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
>
>  #ifdef CONFIG_SMP
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> +/*
> + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
> + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
> + * bottom-up, we only have to test whether the cfs_rq before us on the list
> + * is our child.
> + * If cfs_rq is not on the list, test whether a child needs to be added to
> + * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
> + */
> +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
> +{
> +        struct cfs_rq *prev_cfs_rq;
> +        struct list_head *prev;
> +
> +        if (cfs_rq->on_list) {
> +                prev = cfs_rq->leaf_cfs_rq_list.prev;
> +        } else {
> +                struct rq *rq = rq_of(cfs_rq);
> +
> +                prev = rq->tmp_alone_branch;
> +        }
> +
> +        prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
> +
> +        return (prev_cfs_rq->tg->parent == cfs_rq->tg);
> +}
>
>  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>  {
> @@ -3287,6 +3312,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>          if (cfs_rq->avg.runnable_sum)
>                  return false;
>
> +        if (child_cfs_rq_on_list(cfs_rq))
> +                return false;
> +
>          return true;
>  }
>
> --
> 2.17.1
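One observation on the cfs_rq_is_decayed() change above: with the extra
check, a group whose PELT sums have all decayed to zero is still not treated
as prunable as long as a child below it sits on the list. A minimal sketch
of that decision, with a made-up struct and a plain flag standing in for
child_cfs_rq_on_list() (not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct toy_group {
        const char *name;
        unsigned long load_sum, util_sum, runnable_sum;
        bool child_still_on_list; /* stand-in for child_cfs_rq_on_list() */
};

/* Mirrors the structure of the patched cfs_rq_is_decayed(). */
static bool toy_is_decayed(const struct toy_group *g)
{
        if (g->load_sum || g->util_sum || g->runnable_sum)
                return false;
        if (g->child_still_on_list) /* the new check */
                return false;
        return true;
}

int main(void)
{
        /* level3b fully decayed, but worker3 re-queued under it: */
        struct toy_group level3b = { "level3b", 0, 0, 0, true };
        /* the idle "drain" group with nothing queued below it:   */
        struct toy_group drain   = { "drain",   0, 0, 0, false };

        printf("%s prunable: %d\n", level3b.name, toy_is_decayed(&level3b)); /* 0 */
        printf("%s prunable: %d\n", drain.name, toy_is_decayed(&drain));     /* 1 */
        return 0;
}

So in the ltp hierarchy from the timeline, level3b stops looking decayed
while worker3 is still queued, which keeps the branch connected.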
>
> > Thanks
> > Odin
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index bfaa6e1f6067..aa32e9c29efd 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -376,7 +376,8 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >          return false;
> >  }
> >
> > -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> > +/* Returns 1 if cfs_rq was present in the list and removed */
> > +static inline bool list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >  {
> >          if (cfs_rq->on_list) {
> >                  struct rq *rq = rq_of(cfs_rq);
> >
> > @@ -393,7 +394,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >
> >                  list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> >                  cfs_rq->on_list = 0;
> > +                return 1;
> >          }
> > +        return 0;
> >  }
> >
> >  static inline void assert_list_leaf_cfs_rq(struct rq *rq)
> > @@ -3298,24 +3301,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
> >
> >  #ifdef CONFIG_SMP
> >  #ifdef CONFIG_FAIR_GROUP_SCHED
> > -
> > -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> > -{
> > -        if (cfs_rq->load.weight)
> > -                return false;
> > -
> > -        if (cfs_rq->avg.load_sum)
> > -                return false;
> > -
> > -        if (cfs_rq->avg.util_sum)
> > -                return false;
> > -
> > -        if (cfs_rq->avg.runnable_sum)
> > -                return false;
> > -
> > -        return true;
> > -}
> > -
> >  /**
> >   * update_tg_load_avg - update the tg's load avg
> >   * @cfs_rq: the cfs_rq whose avg changed
> > @@ -4109,11 +4094,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> >
> >  #else /* CONFIG_SMP */
> >
> > -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> > -{
> > -        return true;
> > -}
> > -
> >  #define UPDATE_TG       0x0
> >  #define SKIP_AGE_LOAD   0x0
> >  #define DO_ATTACH       0x0
> > @@ -4771,10 +4751,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> >          if (!cfs_rq->throttle_count) {
> >                  cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> >                                               cfs_rq->throttled_clock_task;
> > -
> > -                /* Add cfs_rq with load or one or more already running entities to the list */
> > -                if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
> > +                if (cfs_rq->insert_on_unthrottle) {
> >                          list_add_leaf_cfs_rq(cfs_rq);
> > +                        if (tg->parent)
> > +                                tg->parent->cfs_rq[cpu_of(rq)]->insert_on_unthrottle = true;
> > +                }
> >          }
> >
> >          return 0;
> > @@ -4788,7 +4769,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> >          /* group is entering throttled state, stop time */
> >          if (!cfs_rq->throttle_count) {
> >                  cfs_rq->throttled_clock_task = rq_clock_task(rq);
> > -                list_del_leaf_cfs_rq(cfs_rq);
> > +                cfs_rq->insert_on_unthrottle = list_del_leaf_cfs_rq(cfs_rq);
> >          }
> >          cfs_rq->throttle_count++;
> >
> > @@ -8019,6 +8000,23 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
> >
> >  #ifdef CONFIG_FAIR_GROUP_SCHED
> >
> > +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> > +{
> > +        if (cfs_rq->load.weight)
> > +                return false;
> > +
> > +        if (cfs_rq->avg.load_sum)
> > +                return false;
> > +
> > +        if (cfs_rq->avg.util_sum)
> > +                return false;
> > +
> > +        if (cfs_rq->avg.runnable_sum)
> > +                return false;
> > +
> > +        return true;
> > +}
> > +
> >  static bool __update_blocked_fair(struct rq *rq, bool *done)
> >  {
> >          struct cfs_rq *cfs_rq, *pos;
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index a189bec13729..12a707d99ee6 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -602,6 +602,7 @@ struct cfs_rq {
> >          u64                     throttled_clock_task_time;
> >          int                     throttled;
> >          int                     throttle_count;
> > +        int                     insert_on_unthrottle;
> >          struct list_head        throttled_list;
> >  #endif /* CONFIG_CFS_BANDWIDTH */
> >  #endif /* CONFIG_FAIR_GROUP_SCHED */
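For comparison, the insert_on_unthrottle approach above boils down to:
deletion remembers whether the group was really on the list, and
re-insertion propagates upwards. Below is a toy walk over the worker3
branch (made-up names; throttle_count accounting and the actual tree walk
are ignored; not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq {
        const char *name;
        struct toy_cfs_rq *parent;
        bool on_list;
        bool insert_on_unthrottle;
};

/* Like tg_throttle_down(): remember whether we really deleted it. */
static void throttle_down(struct toy_cfs_rq *c)
{
        c->insert_on_unthrottle = c->on_list;
        c->on_list = false;
}

/* Like tg_unthrottle_up(): re-add, and mark the parent so it gets
 * re-added too even if it has fully decayed in the meantime. */
static void unthrottle_up(struct toy_cfs_rq *c)
{
        if (c->insert_on_unthrottle) {
                c->on_list = true;
                if (c->parent)
                        c->parent->insert_on_unthrottle = true;
        }
}

int main(void)
{
        struct toy_cfs_rq level2  = { "level2",  NULL,     true, false };
        struct toy_cfs_rq level3b = { "level3b", &level2,  true, false };
        struct toy_cfs_rq worker3 = { "worker3", &level3b, true, false };

        throttle_down(&worker3);  /* worker3 gets throttled      */
        level3b.on_list = false;  /* level3b decays off the list */
        throttle_down(&level2);   /* level2 gets throttled       */

        /* unthrottle walks children before parents: */
        unthrottle_up(&worker3);  /* re-added, marks level3b     */
        unthrottle_up(&level3b);  /* re-added thanks to the mark */
        unthrottle_up(&level2);

        printf("level3b back on the list: %d\n", level3b.on_list); /* 1 */
        return 0;
}

Without the parent marking in unthrottle_up(), level3b would keep
insert_on_unthrottle == false after the decay and stay off the list, which
is exactly the broken case in the timeline.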