Hi,

sorry for being late to the party, trying to catch up..

On Wed, Apr 26, 2017 at 06:51:23PM +0200, Vincent Guittot wrote:
> Ok, I see your point and agree that there is an issue when propagating
> load_avg of a task group which has tasks with lower weight than the share
> but your proposal has got an issue because it uses runnable_load_avg instead
> of load_avg and this makes propagation of load_avg incorrect, something
> like below which keeps using load_avg solves the problem
> 
> +     if (gcfs_rq->load.weight) {
> +             long shares = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg));
> +
> +             load = min(gcfs_rq->avg.load_avg *
> +                        shares / scale_load_down(gcfs_rq->load.weight), shares);
> 


So this here does:

  ( tg->load_avg = \Sum cfs_rq->load_avg )

  load      = cfs_rq->load.weight

  tg_weight = tg->load_avg - cfs_rq->contrib + load


           tg->shares * load
  shares = -----------------
               tg_weight


                        cfs_rq->load_avg
  avg_shares = shares * ----------------
                              load

               tg->shares * cfs_rq->load_avg
             = -----------------------------
                      tg_weight


  ( se->load.weight = shares )

  se->load_avg = min(shares, avg_shares);


So where shares (and se->load.weight) are an upper bound (due to using
cfs_rq->load.weight, see calc_cfs_shares), avg_shares is supposedly a
more accurate representation based on our PELT averages.

This looks OK; and I agree with Vincent that we should use
cfs_rq->avg.load_avg, not cfs_rq->runnable_load_avg, since tg->load_avg
is a sum of the former, not the latter.

Also, arguably calculating the above avg_shares directly (using the
second equation) might be more precise; but I doubt it makes much of a
difference, however since we do min(), we should at least clamp against
MIN_SHARES again.

Furthermore, it appears to me we want a different tg_weight value for
the avg_shares, something like:

  tg_weight = tg->load_avg - cfs_rq->contrib + cfs_rq->avg.load_avg

To better match with the numerator's units, otherwise it will have a
tendency to push avg_shares down further than it needs to be.


(All assuming it actually works of course.. compile tested only)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2634,7 +2634,9 @@ account_entity_dequeue(struct cfs_rq *cf
 # ifdef CONFIG_SMP
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-       long tg_weight, load, shares;
+       long tg_weight, tg_shares, load, shares;
+
+       tg_shares = READ_ONCE(tg->shares);
 
        /*
         * This really should be: cfs_rq->avg.load_avg, but instead we use
@@ -2665,12 +2667,7 @@ static long calc_cfs_shares(struct cfs_r
         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
         * instead of 0.
         */
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
-
-       return shares;
+       return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # else /* CONFIG_SMP */
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
@@ -3075,37 +3072,69 @@ static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-       long delta, load = gcfs_rq->avg.load_avg;
+       long load = 0, delta;
 
        /*
-        * If the load of group cfs_rq is null, the load of the
-        * sched_entity will also be null so we can skip the formula
+        * If there is load in our group cfs_rq (@gcfs_rq), then update its
+        * corresponding group se (@se) load value to reflect any change in
+        * weights.
+        *
+        * Also see effective_load(), where we do something similar.
+        *
+        *   ( tg->load_avg = \Sum cfs_rq->load_avg )
+        *   
+        *   tg_weight = tg->load_avg - cfs_rq->contrib
+        *   
+        *   
+        *            tg->shares * cfs_rq->weight
+        *   shares = ---------------------------
+        *            tg_weight  + cfs_rq->weight
+        *   
+        *   
+        *                tg->shares * cfs_rq->load_avg
+        *   avg_shares = -----------------------------
+        *                tg_weight  + cfs_rq->load_avg
+        *   
+        *   
+        *   ( se->load.weight = shares )
+        *   
+        *   se->load_avg = min(shares, avg_shares);
+        *
+        * Where @shares is an upper bound to @avg_shares, see the comments
+        * in calc_cfs_shares().
         */
-       if (load) {
-               long tg_load;
-
-               /* Get tg's load and ensure tg_load > 0 */
-               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-               /* Ensure tg_load >= load and updated with current load*/
-               tg_load -= gcfs_rq->tg_load_avg_contrib;
-               tg_load += load;
+       if (gcfs_rq->load.weight) {
+               long tg_weight1, tg_weight2, tg_shares, shares, avg_shares;
+               struct task_group *tg = gcfs_rq->tg;
 
                /*
-                * We need to compute a correction term in the case that the
-                * task group is consuming more CPU than a task of equal
-                * weight. A task with a weight equals to tg->shares will have
-                * a load less or equal to scale_load_down(tg->shares).
-                * Similarly, the sched_entities that represent the task group
-                * at parent level, can't have a load higher than
-                * scale_load_down(tg->shares). And the Sum of sched_entities'
-                * load must be <= scale_load_down(tg->shares).
+                * This open-codes calc_cfs_shares(), in order to ensure
+                * we use consistent values for @shares and @avg_shares,
+                * as well as make sure we clip the result properly.
                 */
-               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                       /* scale gcfs_rq's load into tg's shares*/
-                       load *= scale_load_down(gcfs_rq->tg->shares);
-                       load /= tg_load;
-               }
+
+               tg_shares = READ_ONCE(tg->shares);
+
+               load = scale_load_down(gcfs_rq->load.weight);
+
+               tg_weight1 = atomic_long_read(&tg->load_avg);
+               tg_weight2 = (tg_weight1 -= cfs_rq->tg_load_avg_contrib);
+               tg_weight1 += load;
+
+               shares = tg_shares * load;
+               if (tg_weight1)
+                       shares /= tg_weight1;
+
+
+               load = READ_ONCE(gcfs_rq->avg.load_avg);
+
+               tg_weight2 += load;
+
+               avg_shares = tg_shares * load;
+               if (tg_weight2)
+                       avg_shares /= tg_weight2;
+
+               load = clamp_t(long, min(shares, avg_shares), MIN_SHARES, tg_shares);
        }
 
        delta = load - se->avg.load_avg;

Reply via email to