So maybe you want something like the below; that cures the issue Morten
raised, and we continue looking for sd even after we've found affine_sd.

It also avoids the pointless idle_cpu() check Mike raised by making
select_idle_sibling() return -1 if it doesn't find anything.

Then it continues doing the full balance IFF sd was set, which is keyed
off sd->flags.
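
(For reference, the slow path that runs when sd stays set looks roughly
like the sketch below; this is a simplified rendering, not the exact
fair.c code, and the find_idlest_group()/find_idlest_cpu() internals
are elided.)

	while (sd) {
		struct sched_group *group;

		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		/* pick the least loaded group in this domain... */
		group = find_idlest_group(sd, p, cpu, sd_flag);
		if (!group) {
			sd = sd->child;
			continue;
		}

		/* ...and then the least loaded cpu in that group */
		new_cpu = find_idlest_cpu(group, p, cpu);

		/* and repeat the walk in new_cpu's lower domains */
	}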

And note (as Mike already said) that BALANCE_WAKE does _NOT_ look for
idle CPUs; it looks for the least loaded CPU. And it's damn expensive.
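
(To make "least loaded" concrete: the group selection boils down to a
scan like the sketch below. Simplified only -- it ignores capacity
scaling and the imbalance_pct bias towards the local group, and
weighted_cpuload() stands in for whatever per-cpu load it actually
sums.)

	struct sched_group *idlest = NULL, *sg = sd->groups;
	unsigned long min_load = ULONG_MAX;

	do {
		unsigned long load = 0;
		int i;

		/* sum runqueue load over every allowed cpu in the group */
		for_each_cpu_and(i, sched_group_cpus(sg), tsk_cpus_allowed(p))
			load += weighted_cpuload(i);

		if (load < min_load) {
			min_load = load;
			idlest = sg;
		}

		sg = sg->next;
	} while (sg != sd->groups);

That scan touches every runqueue in the domain on each wakeup, which is
where the cost comes from.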


Rewriting this entire thing is somewhere on the todo list :/

---
 kernel/sched/fair.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d597aea43030..7330fb4fea9c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1385,8 +1385,13 @@ static void task_numa_compare(struct task_numa_env *env,
         * One idle CPU per node is evaluated for a task numa move.
         * Call select_idle_sibling to maybe find a better one.
         */
-       if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+       if (!cur) {
+               int cpu = select_idle_sibling(env->p, env->dst_cpu);
+               if (cpu < 0)
+                       cpu = env->dst_cpu;
+
+               env->dst_cpu = cpu;
+       }
 
 assign:
        task_numa_assign(env, cur, imp);
@@ -4951,16 +4956,15 @@ static int select_idle_sibling(struct task_struct *p, int target)
                                        goto next;
                        }
 
-                       target = cpumask_first_and(sched_group_cpus(sg),
+                       return cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
-                       goto done;
 next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
-done:
-       return target;
+       return -1;
 }
+
 /*
  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -5022,22 +5026,28 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                 * If both cpu and prev_cpu are part of this domain,
                 * cpu is a valid SD_WAKE_AFFINE target.
                 */
-               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+               if (want_affine && !affine_sd &&
+                   (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
                        affine_sd = tmp;
-                       break;
-               }
 
                if (tmp->flags & sd_flag)
                        sd = tmp;
+               else if (!want_affine || (want_affine && affine_sd))
+                       break;
        }
 
-       if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+       if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) {
                prev_cpu = cpu;
+               sd = NULL; /* WAKE_AFFINE trumps BALANCE_WAKE */
+       }
 
        if (sd_flag & SD_BALANCE_WAKE) {
-               new_cpu = select_idle_sibling(p, prev_cpu);
-               goto unlock;
+               int tmp = select_idle_sibling(p, prev_cpu);
+               if (tmp >= 0) {
+                       new_cpu = tmp;
+                       goto unlock;
+               }
        }
 
        while (sd) {
--