Currently, rq->leaf_cfs_rq_list is a traversal-ordered list of all live
cfs_rqs which have ever been active on the CPU; unfortunately, this
makes update_blocked_averages() O(total number of CPU cgroups), which
isn't scalable at all.
The next patch will make rq->leaf_cfs_rq_list contain only the cfs_rqs
which are currently active.  In preparation, this patch converts users
which need to traverse all cfs_rqs to use the task_groups list instead.

The task_groups list is protected by its own lock.  While it allows
RCU-protected traversal and the order of operations guarantees that all
online cfs_rqs will be visited, holding rq->lock won't protect against
iterating an already unregistered cfs_rq.  To avoid operating on an
already dead cfs_rq, a new state variable cfs_rq->online is added which
is protected by rq->lock and tracks whether the cfs_rq is registered.

As clearing of cfs_rq->online should be protected by rq->lock, the
locking avoidance optimization in unregister_fair_sched_group() is
removed.  Given that the optimization is meaningful only when removing
cgroups which are created but never used, and the general heaviness of
cgroup removal, the impact is unlikely to be noticeable.

Another change is that print_cfs_stats() would now print cfs_rqs in a
different order.  If this matters, we can improve it later so that it
first prints the cfs_rqs on leaf_cfs_rq_list and then follows up with
the rest.

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Paul Turner <p...@google.com>
Cc: Chris Mason <c...@fb.com>
---
Hello,

This is another set of CPU controller performance fix patches, based on
top of -next as of today.

Thanks.
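To make the conversion easier to review, here is a rough sketch of the
iteration pattern the affected users switch to (illustrative only; the
helper name below is made up, and the real conversions are
update_runtime_enabled(), unthrottle_offline_cfs_rqs() and
print_cfs_stats() in the diff):

/*
 * Sketch, not actual kernel code: walk every task group under RCU and
 * skip cfs_rqs which have already been unregistered.  cfs_rq->online
 * is set in online_fair_sched_group() and cleared in
 * unregister_fair_sched_group(), both under rq->lock.
 */
static void visit_online_cfs_rqs(struct rq *rq)
{
	struct task_group *tg;

	rcu_read_lock();
	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

		if (!cfs_rq->online)	/* unregistered, skip */
			continue;

		/* operate on cfs_rq here, with rq->lock held as needed */
	}
	rcu_read_unlock();
}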
 kernel/sched/fair.c  | 45 +++++++++++++++++++++++++--------------------
 kernel/sched/sched.h |  1 +
 2 files changed, 26 insertions(+), 20 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4644,23 +4644,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online)
+			continue;
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		if (!cfs_rq->runtime_enabled)
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		if (!cfs_rq->online || !cfs_rq->runtime_enabled)
 			continue;
 
 		/*
@@ -4677,6 +4686,7 @@ static void __maybe_unused unthrottle_of
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -9345,6 +9355,7 @@ void online_fair_sched_group(struct task
 		se = tg->se[i];
 
 		raw_spin_lock_irq(&rq->lock);
+		se->my_q->online = 1;
 		update_rq_clock(rq);
 		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
@@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
 void unregister_fair_sched_group(struct task_group *tg)
 {
 	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
 		if (tg->se[cpu])
 			remove_entity_load_avg(tg->se[cpu]);
 
-		/*
-		 * Only empty task groups can be destroyed; so we can speculatively
-		 * check on_list without danger of it being re-added.
-		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		list_del_leaf_cfs_rq(cfs_rq);
+		cfs_rq->online = 0;
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 }
@@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
-		print_cfs_rq(m, cpu, cfs_rq);
+	list_for_each_entry_rcu(tg, &task_groups, list)
+		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
 	rcu_read_unlock();
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -448,6 +448,7 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+	int online;	/* online state, protected by rq->lock */
 
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in