CFS load_balance only takes care of CFS tasks, whereas CPUs can also be used by other scheduling classes. Typically, a CFS task preempted by an RT or deadline task will not get a chance to be pulled to another CPU because load_balance doesn't take tasks from other classes into account. Add the sum of nr_running to the statistics and use it to detect such situations.
Signed-off-by: Vincent Guittot <vincent.guit...@linaro.org> --- kernel/sched/fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d33379c..7e74836 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7716,6 +7716,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -7949,7 +7950,7 @@ static inline int sg_imbalanced(struct sched_group *group) static inline bool group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running < sgs->group_weight) + if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > @@ -7970,7 +7971,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running <= sgs->group_weight) + if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < @@ -8074,6 +8075,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8423,7 +8426,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * groups. 
*/ env->balance_type = migrate_task; - env->imbalance = (busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1; + env->imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1; return; } @@ -8585,7 +8588,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* Try to move all excess tasks to child's sibling domain */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_h_nr_running > local->sum_h_nr_running + 1) + busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; if (busiest->group_type != group_overloaded && -- 2.7.4