Linus,

Please pull the latest sched-core-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-core-for-linus

   HEAD: 8323f26ce3425460769605a6aece7a174edaa7d1 sched: Fix race in task_group()

The biggest change is a performance improvement on SMP systems:

| 4 socket 40 core + SMT Westmere box, single 30 sec tbench 
| runs, higher is better:
|
| clients     1       2       4        8       16       32       64      128
|..........................................................................
| pre        30      41     118      645     3769     6214    12233    14312
| post      299     603    1211     2418     4697     6847    11606    14557
|
| A nice increase in performance.

The speedup is particularly noticeable on heavily interacting, 
few-task workloads, so the changes should help desktop-style Xorg 
workloads and interactivity as well, on multi-core CPUs.
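
The key idea is that each sched_domain now carries a precomputed 
'idle buddy' CPU, assigned when the domains are built, so 
select_idle_sibling() no longer scans every sched_group and every 
CPU of the LLC domain on each wakeup. Condensed from the 
kernel/sched/fair.c hunk below, the new wakeup fast path is 
essentially:

	sd = rcu_dereference(per_cpu(sd_llc, target));

	for_each_lower_domain(sd) {
		/* skip buddies the task is not allowed to run on */
		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
			continue;
		if (idle_cpu(sd->idle_buddy))
			return sd->idle_buddy;
	}

	return target;

i.e. a short, bounded walk over per-domain buddies instead of an 
O(groups * cpus) scan on every wakeup.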

There are also cpuset suspend behavior fixes/restructuring and 
various smaller tweaks.
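
On the suspend side, the CPU hotplug notifiers now distinguish the 
_FROZEN variants: each offline during suspend only bumps a counter 
and collapses everything into a single sched domain, and only the 
last online of the resume sequence falls through to the full 
cpuset-aware rebuild, so user-configured cpusets are left untouched 
across a suspend/resume cycle. Condensed from the 
kernel/sched/core.c hunk below:

	/* cpuset_cpu_inactive(): suspend path */
	case CPU_DOWN_PREPARE_FROZEN:
		num_cpus_frozen++;
		partition_sched_domains(1, NULL, NULL);
		break;

	/* cpuset_cpu_active(): resume path */
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_FAILED_FROZEN:
		num_cpus_frozen--;
		if (likely(num_cpus_frozen)) {
			partition_sched_domains(1, NULL, NULL);
			break;
		}
		/* last CPU of the resume sequence: fall through */
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus(true);
		break;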

( I had to rebase the pending scheduler tree recently, as the 
  work-in-progress NUMA scheduling changes from Peter Zijlstra 
  were still too raw for v3.6. This was hopefully a rare, 
  one-off event. )

 Thanks,

        Ingo

------------------>
Mike Galbraith (1):
      sched: Improve scalability via 'CPU buddies', which withstand random perturbations

Peter Zijlstra (2):
      sched/x86: Remove broken power estimation
      sched: Fix race in task_group()

Prashanth Nageshappa (2):
      sched: Reorder 'struct lb_env' members to reduce its size
      sched: Reset loop counters if all tasks are pinned and we need to redo load balance

Srivatsa S. Bhat (4):
      CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
      cpusets, hotplug: Implement cpuset tree traversal in a helper function
      cpusets, hotplug: Restructure functions that are invoked during hotplug
      cpusets: Remove/update outdated comments

Srivatsa Vaddagiri (1):
      sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task


 arch/x86/kernel/cpu/Makefile |    2 +-
 arch/x86/kernel/cpu/sched.c  |   55 ------------------
 include/linux/cpuset.h       |    4 +-
 include/linux/init_task.h    |   12 +++-
 include/linux/sched.h        |    6 +-
 kernel/cpuset.c              |  130 ++++++++++++++++++++++++++++++------------
 kernel/sched/core.c          |   92 +++++++++++++++++++++++++++---
 kernel/sched/fair.c          |  113 +++++++++++++++++++++++++++---------
 kernel/sched/sched.h         |   23 ++++----
 9 files changed, 291 insertions(+), 146 deletions(-)
 delete mode 100644 arch/x86/kernel/cpu/sched.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2..c598126 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o               := $(nostackp)
 
 obj-y                  := intel_cacheinfo.o scattered.o topology.o
 obj-y                  += proc.o capflags.o powerflags.o common.o
-obj-y                  += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y                  += vmware.o hypervisor.o mshyperv.o
 obj-y                  += rdrand.o
 obj-y                  += match.o
 
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5..0000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-       struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-       unsigned long ratio, flags;
-
-       local_irq_save(flags);
-       get_aperfmperf(&val);
-       local_irq_restore(flags);
-
-       ratio = calc_aperfmperf_ratio(old, &val);
-       *old = val;
-
-       return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-       /*
-        * do aperf/mperf on the cpu level because it includes things
-        * like turbo mode, which are relevant to full cores.
-        */
-       if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-               return scale_aperfmperf();
-
-       /*
-        * maybe have something cpufreq here
-        */
-
-       return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-       /*
-        * aperf/mperf already includes the smt gain
-        */
-       if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-               return SCHED_LOAD_SCALE;
-
-       return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 668f66b..838320f 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,7 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
        partition_sched_domains(1, NULL, NULL);
 }
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9e65eff..b806b82 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -123,8 +123,17 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+extern struct task_group root_task_group;
+
+#ifdef CONFIG_CGROUP_SCHED
+# define INIT_CGROUP_SCHED(tsk)                                        \
+       .sched_task_group = &root_task_group,
+#else
+# define INIT_CGROUP_SCHED(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
-# define INIT_PERF_EVENTS(tsk)                                 \
+# define INIT_PERF_EVENTS(tsk)                                         \
        .perf_event_mutex =                                             \
                 __MUTEX_INITIALIZER(tsk.perf_event_mutex),             \
        .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
@@ -161,6 +170,7 @@ extern struct cred init_cred;
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
        INIT_PUSHABLE_TASKS(tsk)                                        \
+       INIT_CGROUP_SCHED(tsk)                                          \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a1f493..fd9436a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -949,6 +949,7 @@ struct sched_domain {
        unsigned int smt_gain;
        int flags;                      /* See SD_* */
        int level;
+       int idle_buddy;                 /* cpu assigned to select_idle_sibling() */
 
        /* Runtime fields. */
        unsigned long last_balance;     /* init to jiffies. units in jiffies */
@@ -1244,6 +1245,9 @@ struct task_struct {
        const struct sched_class *sched_class;
        struct sched_entity se;
        struct sched_rt_entity rt;
+#ifdef CONFIG_CGROUP_SCHED
+       struct task_group *sched_task_group;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        /* list of struct preempt_notifier: */
@@ -2723,7 +2727,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
-#endif
+#endif /* CONFIG_CGROUP_SCHED */
 
 extern int task_can_switch_user(struct user_struct *up,
                                        struct task_struct *tsk);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd65..f33c715 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
        CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+       CPUSET_CPU_OFFLINE,
+       CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+       struct cpuset *cp;
+       struct cpuset *child;   /* scans child cpusets of cp */
+       struct cgroup *cont;
+
+       if (list_empty(queue))
+               return NULL;
+
+       cp = list_first_entry(queue, struct cpuset, stack_list);
+       list_del(queue->next);
+       list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+               child = cgroup_cs(cont);
+               list_add_tail(&child->stack_list, queue);
+       }
+
+       return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held.  We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'.  But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
        LIST_HEAD(queue);
-       struct cpuset *cp;      /* scans cpusets being updated */
-       struct cpuset *child;   /* scans child cpusets of cp */
-       struct cgroup *cont;
+       struct cpuset *cp;              /* scans cpusets being updated */
        static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
        list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-       while (!list_empty(&queue)) {
-               cp = list_first_entry(&queue, struct cpuset, stack_list);
-               list_del(queue.next);
-               list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                       child = cgroup_cs(cont);
-                       list_add_tail(&child->stack_list, &queue);
+       switch (event) {
+       case CPUSET_CPU_OFFLINE:
+               while ((cp = cpuset_next(&queue)) != NULL) {
+
+                       /* Continue past cpusets with all cpus online */
+                       if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+                               continue;
+
+                       /* Remove offline cpus from this cpuset. */
+                       mutex_lock(&callback_mutex);
+                       cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                                                       cpu_active_mask);
+                       mutex_unlock(&callback_mutex);
+
+                       /* Move tasks from the empty cpuset to a parent */
+                       if (cpumask_empty(cp->cpus_allowed))
+                               remove_tasks_in_empty_cpuset(cp);
+                       else
+                               update_tasks_cpumask(cp, NULL);
                }
+               break;
 
-               /* Continue past cpusets with all cpus, mems online */
-               if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-                   nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-                       continue;
+       case CPUSET_MEM_OFFLINE:
+               while ((cp = cpuset_next(&queue)) != NULL) {
 
-               oldmems = cp->mems_allowed;
+                       /* Continue past cpusets with all mems online */
+                       if (nodes_subset(cp->mems_allowed,
+                                       node_states[N_HIGH_MEMORY]))
+                               continue;
 
-               /* Remove offline cpus and mems from this cpuset. */
-               mutex_lock(&callback_mutex);
-               cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                           cpu_active_mask);
-               nodes_and(cp->mems_allowed, cp->mems_allowed,
+                       oldmems = cp->mems_allowed;
+
+                       /* Remove offline mems from this cpuset. */
+                       mutex_lock(&callback_mutex);
+                       nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
-               mutex_unlock(&callback_mutex);
+                       mutex_unlock(&callback_mutex);
 
-               /* Move tasks from the empty cpuset to a parent */
-               if (cpumask_empty(cp->cpus_allowed) ||
-                    nodes_empty(cp->mems_allowed))
-                       remove_tasks_in_empty_cpuset(cp);
-               else {
-                       update_tasks_cpumask(cp, NULL);
-                       update_tasks_nodemask(cp, &oldmems, NULL);
+                       /* Move tasks from the empty cpuset to a parent */
+                       if (nodes_empty(cp->mems_allowed))
+                               remove_tasks_in_empty_cpuset(cp);
+                       else
+                               update_tasks_nodemask(cp, &oldmems, NULL);
                }
        }
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
        mutex_lock(&callback_mutex);
        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        mutex_unlock(&callback_mutex);
-       scan_for_empty_cpusets(&top_cpuset);
+
+       if (!cpu_online)
+               scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
        ndoms = generate_sched_domains(&doms, &attr);
        cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
        case MEM_OFFLINE:
                /*
                 * needn't update top_cpuset.mems_allowed explicitly because
-                * scan_for_empty_cpusets() will update it.
+                * scan_cpusets_upon_hotplug() will update it.
                 */
-               scan_for_empty_cpusets(&top_cpuset);
+               scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
                break;
        default:
                break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd4..5d011ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
         *
         * sched_move_task() holds both and thus holding either pins the cgroup,
-        * see set_task_rq().
+        * see task_group().
         *
         * Furthermore, all task_rq users should acquire both locks, see
         * task_rq_lock().
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
 
        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-       if (sd)
+       if (sd) {
+               struct sched_domain *tmp = sd;
+               struct sched_group *sg, *prev;
+               bool right;
+
+               /*
+                * Traverse to first CPU in group, and count hops
+                * to cpu from there, switching direction on each
+                * hop, never ever pointing the last CPU rightward.
+                */
+               do {
+                       id = cpumask_first(sched_domain_span(tmp));
+                       prev = sg = tmp->groups;
+                       right = 1;
+
+                       while (cpumask_first(sched_group_cpus(sg)) != id)
+                               sg = sg->next;
+
+                       while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+                               prev = sg;
+                               sg = sg->next;
+                               right = !right;
+                       }
+
+                       /* A CPU went down, never point back to domain start. */
+                       if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+                               right = false;
+
+                       sg = right ? sg->next : prev;
+                       tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+               } while ((tmp = tmp->child));
+
                id = cpumask_first(sched_domain_span(sd));
+       }
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
        mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;    /* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                             void *hcpu)
 {
-       switch (action & ~CPU_TASKS_FROZEN) {
+       switch (action) {
+       case CPU_ONLINE_FROZEN:
+       case CPU_DOWN_FAILED_FROZEN:
+
+               /*
+                * num_cpus_frozen tracks how many CPUs are involved in suspend
+                * resume sequence. As long as this is not the last online
+                * operation in the resume sequence, just build a single sched
+                * domain, ignoring cpusets.
+                */
+               num_cpus_frozen--;
+               if (likely(num_cpus_frozen)) {
+                       partition_sched_domains(1, NULL, NULL);
+                       break;
+               }
+
+               /*
+                * This is the last CPU online operation. So fall through and
+                * restore the original sched domains by considering the
+                * cpuset configurations.
+                */
+
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
-               cpuset_update_active_cpus();
-               return NOTIFY_OK;
+               cpuset_update_active_cpus(true);
+               break;
        default:
                return NOTIFY_DONE;
        }
+       return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                               void *hcpu)
 {
-       switch (action & ~CPU_TASKS_FROZEN) {
+       switch (action) {
        case CPU_DOWN_PREPARE:
-               cpuset_update_active_cpus();
-               return NOTIFY_OK;
+               cpuset_update_active_cpus(false);
+               break;
+       case CPU_DOWN_PREPARE_FROZEN:
+               num_cpus_frozen++;
+               partition_sched_domains(1, NULL, NULL);
+               break;
        default:
                return NOTIFY_DONE;
        }
+       return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+       struct task_group *tg;
        int on_rq, running;
        unsigned long flags;
        struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->put_prev_task(rq, tsk);
 
+       tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+                               lockdep_is_held(&tsk->sighand->siglock)),
+                         struct task_group, css);
+       tg = autogroup_task_group(tsk, tg);
+       tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
                tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6..22321db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
-       struct sched_group *sg;
-       int i;
 
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
                return prev_cpu;
 
        /*
-        * Otherwise, iterate the domains and find an elegible idle cpu.
+        * Otherwise, check assigned siblings to find an elegible idle cpu.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
-       for_each_lower_domain(sd) {
-               sg = sd->groups;
-               do {
-                       if (!cpumask_intersects(sched_group_cpus(sg),
-                                               tsk_cpus_allowed(p)))
-                               goto next;
-
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (!idle_cpu(i))
-                                       goto next;
-                       }
 
-                       target = cpumask_first_and(sched_group_cpus(sg),
-                                       tsk_cpus_allowed(p));
-                       goto done;
-next:
-                       sg = sg->next;
-               } while (sg != sd->groups);
+       for_each_lower_domain(sd) {
+               if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+                       continue;
+               if (idle_cpu(sd->idle_buddy))
+                       return sd->idle_buddy;
        }
-done:
+
        return target;
 }
 
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
        struct sched_domain     *sd;
 
-       int                     src_cpu;
        struct rq               *src_rq;
+       int                     src_cpu;
 
        int                     dst_cpu;
        struct rq               *dst_rq;
 
+       struct cpumask          *dst_grpmask;
+       int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
        unsigned int            flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+               int new_dst_cpu;
+
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+               /*
+                * Remember if this task can be migrated to any other cpu in
+                * our sched_group. We may want to revisit it if we couldn't
+                * meet load balance goals by pulling other tasks on src_cpu.
+                *
+                * Also avoid computing new_dst_cpu if we have already computed
+                * one in current iteration.
+                */
+               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                       return 0;
+
+               new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                               tsk_cpus_allowed(p));
+               if (new_dst_cpu < nr_cpu_ids) {
+                       env->flags |= LBF_SOME_PINNED;
+                       env->new_dst_cpu = new_dst_cpu;
+               }
                return 0;
        }
+
+       /* Record that we found atleast one task that could run on dst_cpu */
        env->flags &= ~LBF_ALL_PINNED;
 
        if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
 {
-       int ld_moved, active_balance = 0;
+       int ld_moved, cur_ld_moved, active_balance = 0;
+       int lb_iterations, max_lb_iterations;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
+               .dst_grpmask    = sched_group_cpus(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
        };
 
        cpumask_copy(cpus, cpu_active_mask);
+       max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
        schedstat_inc(sd, lb_count[idle]);
 
@@ -4267,6 +4281,7 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
        ld_moved = 0;
+       lb_iterations = 1;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
                double_rq_lock(this_rq, busiest);
                if (!env.loop)
                        update_h_load(env.src_cpu);
-               ld_moved += move_tasks(&env);
+
+               /*
+                * cur_ld_moved - load moved in current iteration
+                * ld_moved     - cumulative load moved across iterations
+                */
+               cur_ld_moved = move_tasks(&env);
+               ld_moved += cur_ld_moved;
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
 
@@ -4296,14 +4317,52 @@ more_balance:
                /*
                 * some other cpu did the load balance for us.
                 */
-               if (ld_moved && this_cpu != smp_processor_id())
-                       resched_cpu(this_cpu);
+               if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                       resched_cpu(env.dst_cpu);
+
+               /*
+                * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                * us and move them to an alternate dst_cpu in our sched_group
+                * where they can run. The upper limit on how many times we
+                * iterate on same src_cpu is dependent on number of cpus in our
+                * sched_group.
+                *
+                * This changes load balance semantics a bit on who can move
+                * load to a given_cpu. In addition to the given_cpu itself
+                * (or a ilb_cpu acting on its behalf where given_cpu is
+                * nohz-idle), we now have balance_cpu in a position to move
+                * load to given_cpu. In rare situations, this may cause
+                * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                * _independently_ and at _same_ time to move some load to
+                * given_cpu) causing exceess load to be moved to given_cpu.
+                * This however should not happen so much in practice and
+                * moreover subsequent load balance cycles should correct the
+                * excess load moved.
+                */
+               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                               lb_iterations++ < max_lb_iterations) {
+
+                       this_rq          = cpu_rq(env.new_dst_cpu);
+                       env.dst_rq       = this_rq;
+                       env.dst_cpu      = env.new_dst_cpu;
+                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.loop         = 0;
+                       env.loop_break   = sched_nr_migrate_break;
+                       /*
+                        * Go back to "more_balance" rather than "redo" since we
+                        * need to continue with same src_cpu.
+                        */
+                       goto more_balance;
+               }
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                       if (!cpumask_empty(cpus))
+                       if (!cpumask_empty(cpus)) {
+                               env.loop = 0;
+                               env.loop_break = sched_nr_migrate_break;
                                goto redo;
+                       }
                        goto out_balanced;
                }
        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f2..c35a1a7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-       struct task_group *tg;
-       struct cgroup_subsys_state *css;
-
-       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                       lockdep_is_held(&p->pi_lock) ||
-                       lockdep_is_held(&task_rq(p)->lock));
-       tg = container_of(css, struct task_group, css);
-
-       return autogroup_task_group(p, tg);
+       return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
--